diff --git a/envoy/common/interval_set.h b/envoy/common/interval_set.h index 51ca2068f3091..8cfe303cf20f8 100644 --- a/envoy/common/interval_set.h +++ b/envoy/common/interval_set.h @@ -37,6 +37,14 @@ template class IntervalSet { * Clears the contents of the interval set. */ virtual void clear() PURE; + + /** + * Determines whether the specified Value is in any of the intervals. + * + * @param value the value + * @return true if value is covered in the interval set. + */ + virtual bool test(Value value) const PURE; }; } // namespace Envoy diff --git a/source/common/common/utility.h b/source/common/common/utility.h index 5084dbff92733..54ce19af415bf 100644 --- a/source/common/common/utility.h +++ b/source/common/common/utility.h @@ -561,6 +561,11 @@ template class IntervalSetImpl : public IntervalSet { intervals_.insert(Interval(left, right)); } + bool test(Value value) const override { + const auto left_pos = intervals_.lower_bound(Interval(value, value + 1)); + return left_pos != intervals_.end() && value >= left_pos->first && value < left_pos->second; + } + std::vector toVector() const override { return std::vector(intervals_.begin(), intervals_.end()); } diff --git a/source/common/json/BUILD b/source/common/json/BUILD index 1a42cd0004385..fe750b28c3824 100644 --- a/source/common/json/BUILD +++ b/source/common/json/BUILD @@ -34,3 +34,10 @@ envoy_cc_library( "//source/common/runtime:runtime_features_lib", ], ) + +envoy_cc_library( + name = "json_sanitizer_lib", + srcs = ["json_sanitizer.cc"], + hdrs = ["json_sanitizer.h"], + deps = ["//source/common/common:assert_lib"], +) diff --git a/source/common/json/json_internal.cc b/source/common/json/json_internal.cc index 4b480831c6c19..7980edb14a644 100644 --- a/source/common/json/json_internal.cc +++ b/source/common/json/json_internal.cc @@ -686,6 +686,11 @@ ObjectSharedPtr Factory::loadFromString(const std::string& json) { return handler.getRoot(); } +std::string Factory::serialize(absl::string_view str) { + 
nlohmann::json j(str); + return j.dump(); +} + } // namespace Nlohmann } // namespace Json } // namespace Envoy diff --git a/source/common/json/json_internal.h b/source/common/json/json_internal.h index de665333a1c09..686430826a7d4 100644 --- a/source/common/json/json_internal.h +++ b/source/common/json/json_internal.h @@ -5,6 +5,8 @@ #include "envoy/json/json_object.h" +#include "absl/strings/string_view.h" + namespace Envoy { namespace Json { namespace Nlohmann { @@ -15,6 +17,8 @@ class Factory { * Constructs a Json Object from a string. */ static ObjectSharedPtr loadFromString(const std::string& json); + + static std::string serialize(absl::string_view str); }; } // namespace Nlohmann diff --git a/source/common/json/json_sanitizer.cc b/source/common/json/json_sanitizer.cc new file mode 100644 index 0000000000000..5ca322b2993d5 --- /dev/null +++ b/source/common/json/json_sanitizer.cc @@ -0,0 +1,255 @@ +#include "source/common/json/json_sanitizer.h" + +#include + +#include "source/common/common/assert.h" + +#include "absl/strings/str_cat.h" +#include "absl/strings/str_format.h" + +namespace Envoy { +namespace Json { + +namespace { + +const uint8_t Literal = 0; +const uint8_t ControlEscapeSize = 2; // e.g. \b +const uint8_t UnicodeEscapeSize = 6; // e.g. \u1234 +const uint8_t Utf8DecodeSentinel = 0xff; + +} // namespace + +JsonSanitizer::JsonSanitizer() { + // Single-char escape sequences for common control characters. + auto symbolic_escape = [this](char control_char, char symbolic) { + Escape& escape = char_escapes_[char2uint32(control_char)]; + escape.size_ = ControlEscapeSize; + escape.chars_[0] = '\\'; + escape.chars_[1] = symbolic; + }; + symbolic_escape('\b', 'b'); + symbolic_escape('\f', 'f'); + symbolic_escape('\n', 'n'); + symbolic_escape('\r', 'r'); + symbolic_escape('\t', 't'); + symbolic_escape('\\', '\\'); + symbolic_escape('"', '"'); + + // Low characters (0-31) not listed above are encoded as unicode 4-digit hex. 
+ auto unicode_escape = [this](uint32_t index) { + // We capture unicode Escapes both in a char-indexed array, for direct + // substitutions on literal inputs, and in a unicode-indexed hash-map, + // for lookup after utf8 decode. + std::string escape_str = absl::StrFormat("\\u%04x", index); + ASSERT(escape_str.size() == UnicodeEscapeSize); + Escape& escape = unicode_escapes_[index]; + escape.size_ = escape_str.size(); + RELEASE_ASSERT(escape.size_ <= sizeof(escape.chars_), "escaped string too large"); + memcpy(escape.chars_, escape_str.data(), escape_str.size()); // NOLINT(safe-memcpy)*/ + if (index < NumEscapes) { + char_escapes_[index] = escape; + } + }; + + // Add unicode escapes for control-characters below 32 that don't have symbolic escapes. + for (uint32_t i = 0; i < ' '; ++i) { + if (char_escapes_[i].size_ == 0) { + unicode_escape(i); + } + } + + // Unicode-escaped ascii constants above SPACE (32). + for (char ch : {'<', '>', '\177'}) { + unicode_escape(char2uint32(ch)); + } + + // There's a range of 8-bit characters that are unicode escaped by the + // protobuf library, so we match behavior. + for (uint32_t i = 0x0080; i < 0x00a0; ++i) { + unicode_escape(i); + } + + // The remaining unicode characters are mostly passed through literally. We'll + // initialize all of them and then override some below. + for (uint32_t i = 0x00a0; i < NumEscapes; ++i) { + char_escapes_[i].size_ = Literal; + } + + // All the bytes matching pattern 11xxxxxx will be evaluated as utf-8. + for (uint32_t i = Utf8_2BytePattern; i <= 0xff; ++i) { + char_escapes_[i].size_ = Utf8DecodeSentinel; + } + + // There are an assortment of unicode characters that protobufs quote, so we + // do likewise here to make differential testing/fuzzing feasible. 
+ for (uint32_t i : {0x00ad, 0x0600, 0x0601, 0x0602, 0x0603, 0x06dd, 0x070f}) { + unicode_escape(i); + } +} + +absl::string_view JsonSanitizer::sanitize(std::string& buffer, absl::string_view str) const { + // Fast-path to see whether any escapes or utf-encoding are needed. If str has + // only unescaped ascii characters, we can simply return it. So before doing + // anything too fancy, do a lookup in char_escapes_ for each character, and + // simply OR in the return sizes. We use 0 for the return-size when we are + // simply leaving the character as is, so anything non-zero means we need to + // initiate the slow path. + // + // Benchmarks show it's faster to just rip through the string with no + // conditionals, so we only check the ORed sizes after the loop. This avoids + // branches and allows simpler loop unrolling by the compiler. + uint32_t sizes_ored_together = 0; + for (char c : str) { + sizes_ored_together |= char_escapes_[char2uint32(c)].size_; + } + if (sizes_ored_together == 0) { + return str; // Happy path, should be executed most of the time. 
+ } + return slowSanitize(buffer, str); +} + +absl::string_view JsonSanitizer::slowSanitize(std::string& buffer, absl::string_view str) const { + std::string oct_escape_buf; + size_t past_escape = absl::string_view::npos; + const uint8_t* first = reinterpret_cast(str.data()); + const uint8_t* data = first; + absl::string_view escape_view; + for (uint32_t n = str.size(); n != 0; ++data, --n) { + const Escape& escape = char_escapes_[*data]; + if (escape.size_ != Literal) { + uint32_t start_of_escape = data - first; + switch (escape.size_) { + case ControlEscapeSize: + case UnicodeEscapeSize: + escape_view = absl::string_view(escape.chars_, escape.size_); + break; + case Utf8DecodeSentinel: { + auto [unicode, consumed] = decodeUtf8(data, n); + if (consumed != 0) { + --consumed; + data += consumed; + n -= consumed; + + // Having validated and constructed the unicode for the utf-8 + // sequence we must determine whether to render it literally by + // simply leaving it alone, or whether we ought to render it + // as a unicode escape. We do this using a hash-map set up during + // the constructor with all desired unicode escapes, to mimic the + // behavior of the protobuf json serializer. + auto iter = unicode_escapes_.find(unicode); + if (iter == unicode_escapes_.end()) { + continue; + } + escape_view = absl::string_view(iter->second.chars_, iter->second.size_); + } else { + // Using StrFormat during decode seems slow, but this case should be + // rare. + oct_escape_buf = absl::StrFormat("\\%03o", *data); + escape_view = absl::string_view(oct_escape_buf); + } + break; + } + default: + ASSERT(false); + } + + if (past_escape == absl::string_view::npos) { + // We only initialize buffer when we first learn we need to add an + // escape-sequence to the sanitized string. + if (start_of_escape == 0) { + // The first character is an escape, and 'buffer' has not been cleared yet, + // so we need to assign it rather than append to it. 
+ buffer.assign(escape_view.data(), escape_view.size()); + } else { + // We found our first escape, but this is not the first character in the + // string, so we combine the unescaped characters in the string we already + // looped over with the new escaped character. + buffer = absl::StrCat(str.substr(0, start_of_escape), escape_view); + } + } else if (start_of_escape == past_escape) { + // We are adding an escape immediately after another escaped character. + absl::StrAppend(&buffer, escape_view); + } else { + // We are adding a new escape but must first cover the characters + // encountered since the previous escape. + absl::StrAppend(&buffer, str.substr(past_escape, start_of_escape - past_escape), + escape_view); + } + past_escape = data - first + 1; + } + } + + // If no escape-sequence was needed, we just return the input. + if (past_escape == absl::string_view::npos) { + return str; + } + + // Otherwise we append on any unescaped chunk at the end of the input, and + // return buffer as the result. + if (past_escape < str.size()) { + absl::StrAppend(&buffer, str.substr(past_escape, str.size() - past_escape)); + } + return buffer; +} + +std::pair JsonSanitizer::decodeUtf8(const uint8_t* bytes, uint32_t size) { + uint32_t unicode = 0; + uint32_t consumed = 0; + + // See table in https://en.wikipedia.org/wiki/UTF-8, "Encoding" section. + // + // See also https://en.cppreference.com/w/cpp/locale/codecvt_utf8 which is + // marked as deprecated. There is also support in Windows libraries and Boost, + // which can be discovered on StackOverflow. I could not find a usable OSS + // implementation. However it's easily derived from the spec on Wikipedia. + // + // Note that the code below could be optimized a bit, e.g. by factoring out + // repeated lookups of the same index in the bytes array and using SSE + // instructions for the multi-word bit hacking. 
+ // + // See also http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ which might be a lot + // faster, though less readable. As coded, though, it looks like it would read + // past the end of the input if the input is malformed. + if (size >= 2 && (bytes[0] & Utf8_2ByteMask) == Utf8_2BytePattern && + (bytes[1] & Utf8_ContinueMask) == Utf8_ContinuePattern) { + unicode = bytes[0] & ~Utf8_2ByteMask; + unicode = (unicode << Utf8_Shift) | (bytes[1] & ~Utf8_ContinueMask); + if (unicode < 0x80) { + return UnicodeSizePair(0, 0); + } + consumed = 2; + } else if (size >= 3 && (bytes[0] & Utf8_3ByteMask) == Utf8_3BytePattern && + (bytes[1] & Utf8_ContinueMask) == Utf8_ContinuePattern && + (bytes[2] & Utf8_ContinueMask) == Utf8_ContinuePattern) { + unicode = bytes[0] & ~Utf8_3ByteMask; + unicode = (unicode << Utf8_Shift) | (bytes[1] & ~Utf8_ContinueMask); + unicode = (unicode << Utf8_Shift) | (bytes[2] & ~Utf8_ContinueMask); + if (unicode < 0x800) { // 3-byte starts at 0x800 + return UnicodeSizePair(0, 0); + } + consumed = 3; + } else if (size >= 4 && (bytes[0] & Utf8_4ByteMask) == Utf8_4BytePattern && + (bytes[1] & Utf8_ContinueMask) == Utf8_ContinuePattern && + (bytes[2] & Utf8_ContinueMask) == Utf8_ContinuePattern && + (bytes[3] & Utf8_ContinueMask) == Utf8_ContinuePattern) { + unicode = bytes[0] & ~Utf8_4ByteMask; + unicode = (unicode << Utf8_Shift) | (bytes[1] & ~Utf8_ContinueMask); + unicode = (unicode << Utf8_Shift) | (bytes[2] & ~Utf8_ContinueMask); + unicode = (unicode << Utf8_Shift) | (bytes[3] & ~Utf8_ContinueMask); + + // 4-byte starts at 0x10000 + // + // Note from https://en.wikipedia.org/wiki/UTF-8: + // The earlier RFC2279 allowed UTF-8 encoding through code point U+7FFFFFF. + // But the current RFC3629 section 3 limits UTF-8 encoding through code + // point U+10FFFF, to match the limits of UTF-16. 
+ if (unicode < 0x10000 || unicode > 0x10ffff) { + return UnicodeSizePair(0, 0); + } + consumed = 4; + } + return UnicodeSizePair(unicode, consumed); +} + +} // namespace Json +} // namespace Envoy diff --git a/source/common/json/json_sanitizer.h b/source/common/json/json_sanitizer.h new file mode 100644 index 0000000000000..7a175ce86d741 --- /dev/null +++ b/source/common/json/json_sanitizer.h @@ -0,0 +1,93 @@ +#pragma once + +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/strings/string_view.h" + +namespace Envoy { +namespace Json { + +// Hand-rolled JSON sanitizer that has exactly the same behavior as serializing +// through protobufs, but is more than 10x faster. From +// test/common/json/json_sanitizer_speed_test.cc: +// +// --------------------------------------------------------------------------- +// Benchmark Time CPU Iterations +// --------------------------------------------------------------------------- +// BM_ProtoEncoderNoEscape 1123 ns 1123 ns 545345 +// BM_JsonSanitizerNoEscape 8.77 ns 8.77 ns 79517538 +// BM_StaticJsonSanitizerNoEscape 9.52 ns 9.52 ns 73570603 +// BM_ProtoEncoderWithEscape 1326 ns 1326 ns 528576 +// BM_JsonSanitizerWithEscape 96.3 ns 96.3 ns 7289627 +// BM_StaticJsonSanitizerWithEscape 97.5 ns 97.5 ns 7157098 +// +class JsonSanitizer { +public: + static constexpr uint32_t Utf8_2ByteMask = 0b11100000; + static constexpr uint32_t Utf8_3ByteMask = 0b11110000; + static constexpr uint32_t Utf8_4ByteMask = 0b11111000; + + static constexpr uint32_t Utf8_2BytePattern = 0b11000000; + static constexpr uint32_t Utf8_3BytePattern = 0b11100000; + static constexpr uint32_t Utf8_4BytePattern = 0b11110000; + + static constexpr uint32_t Utf8_ContinueMask = 0b11000000; + static constexpr uint32_t Utf8_ContinuePattern = 0b10000000; + + static constexpr uint32_t Utf8_Shift = 6; + + // Constructing the sanitizer fills in a table with all escape-sequences, + // indexed by character. 
To make this perform well, you should instantiate the + // sanitizer in a context that lives across a large number of sanitizations. + JsonSanitizer(); + + /** + * Sanitizes a string so it is suitable for JSON. The buffer is + * used if any of the characters in str need to be escaped. + * + * @param buffer a string in which an escaped string can be written, if needed. It + * is not necessary for callers to clear the buffer first; it will be cleared + * by this method if the input needs to be escaped. + * @param str the string to be translated + * @return the translated string_view. + */ + absl::string_view sanitize(std::string& buffer, absl::string_view str) const; + + /** The Unicode code-point and the number of utf8-bytes consumed */ + using UnicodeSizePair = std::pair; + + /** + * Decodes a byte-stream of UTF8, returning the resulting unicode and the + * number of bytes consumed as a pair. + * + * @param bytes The data with utf8 bytes. + * @param size The number of bytes available in bytes + * @return UnicodeSizePair(unicode, consumed) -- if the decode fails consumed will be 0. + */ + static UnicodeSizePair decodeUtf8(const uint8_t* bytes, uint32_t size); + +private: + // static constexpr uint32_t NumEscapes = 1 << 11; // 2^11=2048 codes possible in 2-byte utf8. + static constexpr uint32_t NumEscapes = 256; + + // Character-indexed array of translation strings. If an entry has size_ 0 then + // the character does not require substitution. This strategy is dependent on + // the property of UTF-8 where all two-byte characters have the high-order bit + // set for both bytes, and don't require escaping for JSON. Thus we can + // consider each character in isolation for escaping. Reference: + // https://en.wikipedia.org/wiki/UTF-8. + struct Escape { + uint8_t size_{0}; + char chars_[7]; // No need to initialize char data, as we are not null-terminating. 
+ }; + + static uint32_t char2uint32(char c) { return static_cast(static_cast(c)); } + absl::string_view slowSanitize(std::string& buffer, absl::string_view str) const; + + Escape char_escapes_[NumEscapes]; + absl::flat_hash_map unicode_escapes_; +}; + +} // namespace Json +} // namespace Envoy diff --git a/test/common/common/utility_test.cc b/test/common/common/utility_test.cc index 274ab220dd388..6f679d8dbc458 100644 --- a/test/common/common/utility_test.cc +++ b/test/common/common/utility_test.cc @@ -931,6 +931,21 @@ TEST(IntervalSet, testIntervalTargeted) { EXPECT_EQ("[15, 20), [25, 30), [35, 40), [41, 43)", test(41, 43)); } +TEST(IntervalSet, testTest) { + IntervalSetImpl set; + set.insert(4, 6); + EXPECT_FALSE(set.test(0)); + set.insert(0, 2); + EXPECT_TRUE(set.test(0)); + EXPECT_TRUE(set.test(1)); + EXPECT_FALSE(set.test(2)); + EXPECT_FALSE(set.test(3)); + EXPECT_TRUE(set.test(4)); + EXPECT_TRUE(set.test(5)); + EXPECT_FALSE(set.test(6)); + EXPECT_FALSE(set.test(7)); +} + TEST(WelfordStandardDeviation, AllEntriesTheSame) { WelfordStandardDeviation wsd; wsd.update(10); diff --git a/test/common/json/BUILD b/test/common/json/BUILD index ebb6d6aeb63a4..7eafb0f631b70 100644 --- a/test/common/json/BUILD +++ b/test/common/json/BUILD @@ -1,7 +1,10 @@ load( "//bazel:envoy_build_system.bzl", + "envoy_cc_benchmark_binary", + "envoy_cc_binary", "envoy_cc_fuzz_test", "envoy_cc_test", + "envoy_cc_test_library", "envoy_package", ) @@ -36,3 +39,98 @@ envoy_cc_test( "//test/test_common:utility_lib", ], ) + +envoy_cc_test( + name = "json_sanitizer_test", + srcs = ["json_sanitizer_test.cc"], + deps = [ + ":json_sanitizer_test_util_lib", + "//source/common/json:json_internal_lib", + "//source/common/json:json_sanitizer_lib", + "//source/common/protobuf:utility_lib", + ], +) + +envoy_cc_benchmark_binary( + name = "json_sanitizer_speed_test", + srcs = ["json_sanitizer_speed_test.cc"], + deps = [ + "//source/common/json:json_internal_lib", + 
"//source/common/json:json_sanitizer_lib", + "//source/common/protobuf:utility_lib", + ], +) + +envoy_cc_fuzz_test( + name = "json_sanitizer_fuzz_test", + srcs = ["json_sanitizer_fuzz_test.cc"], + corpus = "json_sanitizer_corpus", + deps = [ + ":json_sanitizer_test_util_lib", + "//source/common/json:json_sanitizer_lib", + "//source/common/protobuf:utility_lib", + "//test/fuzz:utility_lib", + ], +) + +envoy_cc_binary( + name = "gen_excluded_unicodes", + srcs = ["gen_excluded_unicodes.cc"], + deps = [ + "//source/common/json:json_sanitizer_lib", + "//source/common/protobuf:utility_lib", + ], +) + +#genrule( +# name = "extensions_security_rst", +# srcs = [ +# "//source/extensions:extensions_metadata.yaml", +# "//contrib:extensions_metadata.yaml", +# ], +# outs = ["extensions_security_rst.tar"], +# cmd = """ +# $(location //tools/docs:generate_extensions_security_rst) \\ +# $(location //source/extensions:extensions_metadata.yaml) \\ +# $(location //contrib:extensions_metadata.yaml) $@ +# """, +# tools = ["//tools/docs:generate_extensions_security_rst"], +#) +# +#envoy_directory_genrule( +# name = "corpus_from_config_impl", +# testonly = 1, +# srcs = [ +# # This is deliberately in srcs, since we run into host/target confusion +# # otherwise in oss-fuzz builds. 
+# ":config_impl_test_static", +# ], +# cmd = " ".join([ +# "$(location corpus_from_config_impl_sh)", +# "$(location //test/common/router:config_impl_test_static)", +# ]), +# tools = [":corpus_from_config_impl_sh"], +#) +# +#genrule( +# name = "generate_excluded_unicodes", +# srcs = [ +# "admin_head_start.html", +# "admin.css", +# ], +# outs = ["admin_html_gen.h"], +# cmd = "./$(location :generate_admin_html.sh) \ +# $(location admin_head_start.html) $(location admin.css) > $@", +# visibility = ["//visibility:private"], +# deps = [":generate_excluded_unicodes"], +#) + +envoy_cc_test_library( + name = "json_sanitizer_test_util_lib", + srcs = ["json_sanitizer_test_util.cc"], + hdrs = ["json_sanitizer_test_util.h"], + deps = [ + "//source/common/common:utility_lib", + "//source/common/json:json_sanitizer_lib", + ], +) diff --git a/test/common/json/gen_excluded_unicodes.cc b/test/common/json/gen_excluded_unicodes.cc new file mode 100644 index 0000000000000..fbd68237f6aaf --- /dev/null +++ b/test/common/json/gen_excluded_unicodes.cc @@ -0,0 +1,130 @@ +#include "source/common/json/json_sanitizer.h" +#include "source/common/protobuf/utility.h" + +#include "absl/strings/str_format.h" + +namespace Envoy { +namespace Json { + +// Collects unicode values that cannot be handled by the protobuf json encoder. +// This is not needed for correct operation of the json sanitizer, but it is +// needed for comparing sanitization results against the proto serializer, and +// for differential fuzzing. We need to avoid comparing sanitization results for +// strings containing utf-8 sequences that protobufs cannot serialize. +// +// Normally when running tests, nothing will be passed to collect(), and emit() +// will return false. 
But if the protobuf library changes and different unicode +// sets become invalid, we can re-run the collector with: +// +// bazel build -c opt test/common/json:json_sanitizer_test +// GENERATE_INVALID_UTF8_RANGES=1 +// ./bazel-bin/test/common/json/json_sanitizer_test |& +// grep -v 'contains invalid UTF-8' +// +// The grep pipe is essential as otherwise you will be buried in thousands of +// messages from the protobuf library that cannot otherwise be trapped. The +// "-c opt" is essential because JsonSanitizerTest.AllFourByteUtf8 iterates over +// all 4-byte sequences which takes almost 20 seconds without optimization, so +// it is conditionally compiled on NDEBUG. +// +// Running in this mode causes two tests to fail, but prints two initialization +// blocks for invalid byte code ranges, which can then be pasted into the +// InvalidUnicodeSet constructor in json_sanitizer_test_util.cc. +class InvalidUnicodeCollector { +public: + /** + * Collects a unicode value that cannot be parsed as utf8 by the protobuf serializer. + * + * @param unicode the unicode value + */ + void collect(uint32_t unicode) { invalid_.insert(unicode, unicode + 1); } + + /** + * Emits the collection of invalid unicode ranges to stdout. + * + * @return true if any invalid ranges were found. 
+ */ + bool emit(absl::string_view variable_name) { + bool has_invalid = false; + for (IntervalSet::Interval& interval : invalid_.toVector()) { + has_invalid = true; + std::cout << absl::StrFormat(" %s.insert(0x%x, 0x%x);\n", variable_name, interval.first, + interval.second); + } + return has_invalid; + } + +private: + IntervalSetImpl invalid_; +}; + +bool isInvalidProtobufSerialization(const std::string& str) { + return str.size() == 2 && str[0] == '"' && str[1] == '"'; +} + +void AllThreeByteUtf8() { + std::string utf8("abc"); + InvalidUnicodeCollector invalid; + + for (uint32_t byte1 = 0; byte1 < 16; ++byte1) { + utf8[0] = byte1 | JsonSanitizer::Utf8_3BytePattern; + for (uint32_t byte2 = 0; byte2 < 64; ++byte2) { + utf8[1] = byte2 | JsonSanitizer::Utf8_ContinuePattern; + for (uint32_t byte3 = 0; byte3 < 64; ++byte3) { + utf8[2] = byte3 | JsonSanitizer::Utf8_ContinuePattern; + auto [unicode, consumed] = Envoy::Json::JsonSanitizer::decodeUtf8( + reinterpret_cast(utf8.data()), 3); + if (consumed == 3) { + std::string proto_sanitized = + MessageUtil::getJsonStringFromMessageOrDie(ValueUtil::stringValue(utf8), false, true); + if (isInvalidProtobufSerialization(proto_sanitized)) { + invalid.collect(unicode); + } + } else { + ASSERT(consumed == 0); + } + } + } + } + + invalid.emit("invalid_3byte_intervals_"); +} + +void AllFourByteUtf8() { + std::string utf8("abcd"); + InvalidUnicodeCollector invalid; + + for (uint32_t byte1 = 0; byte1 < 16; ++byte1) { + utf8[0] = byte1 | JsonSanitizer::Utf8_4BytePattern; + for (uint32_t byte2 = 0; byte2 < 64; ++byte2) { + utf8[1] = byte2 | JsonSanitizer::Utf8_ContinuePattern; + for (uint32_t byte3 = 0; byte3 < 64; ++byte3) { + utf8[2] = byte3 | JsonSanitizer::Utf8_ContinuePattern; + for (uint32_t byte4 = 0; byte4 < 64; ++byte4) { + utf8[3] = byte4 | JsonSanitizer::Utf8_ContinuePattern; + auto [unicode, consumed] = Envoy::Json::JsonSanitizer::decodeUtf8( + reinterpret_cast(utf8.data()), 4); + if (consumed == 4) { + std::string 
proto_sanitized = MessageUtil::getJsonStringFromMessageOrDie( + ValueUtil::stringValue(utf8), false, true); + if (isInvalidProtobufSerialization(proto_sanitized)) { + invalid.collect(unicode); + } + } else { + ASSERT(consumed == 0); + } + } + } + } + } + invalid.emit("invalid_4byte_intervals_"); +} + +} // namespace Json +} // namespace Envoy + +int main() { + Envoy::Json::AllThreeByteUtf8(); + Envoy::Json::AllFourByteUtf8(); + return 0; +} diff --git a/test/common/json/json_sanitizer_corpus/binary_file b/test/common/json/json_sanitizer_corpus/binary_file new file mode 100644 index 0000000000000..eb70d74b0caaf Binary files /dev/null and b/test/common/json/json_sanitizer_corpus/binary_file differ diff --git a/test/common/json/json_sanitizer_corpus/hello_world_multi_language b/test/common/json/json_sanitizer_corpus/hello_world_multi_language new file mode 100644 index 0000000000000..33a23898ed30c --- /dev/null +++ b/test/common/json/json_sanitizer_corpus/hello_world_multi_language @@ -0,0 +1 @@ +Hello world, Καλημέρα κόσμε, コンニチハ \ No newline at end of file diff --git a/test/common/json/json_sanitizer_corpus/lower_case b/test/common/json/json_sanitizer_corpus/lower_case new file mode 100644 index 0000000000000..e85d5b45283ac --- /dev/null +++ b/test/common/json/json_sanitizer_corpus/lower_case @@ -0,0 +1 @@ +abcdefghijklmnopqrstuvwxyz \ No newline at end of file diff --git a/test/common/json/json_sanitizer_corpus/one_quote_begin b/test/common/json/json_sanitizer_corpus/one_quote_begin new file mode 100644 index 0000000000000..4977cad19bb50 --- /dev/null +++ b/test/common/json/json_sanitizer_corpus/one_quote_begin @@ -0,0 +1 @@ +"ab \ No newline at end of file diff --git a/test/common/json/json_sanitizer_corpus/one_quote_end b/test/common/json/json_sanitizer_corpus/one_quote_end new file mode 100644 index 0000000000000..d897f58692ae6 --- /dev/null +++ b/test/common/json/json_sanitizer_corpus/one_quote_end @@ -0,0 +1 @@ +ab" \ No newline at end of file diff --git 
a/test/common/json/json_sanitizer_corpus/one_quote_middle b/test/common/json/json_sanitizer_corpus/one_quote_middle new file mode 100644 index 0000000000000..68329fbda325e --- /dev/null +++ b/test/common/json/json_sanitizer_corpus/one_quote_middle @@ -0,0 +1 @@ +a"b \ No newline at end of file diff --git a/test/common/json/json_sanitizer_corpus/punctuation b/test/common/json/json_sanitizer_corpus/punctuation new file mode 100644 index 0000000000000..00a387e30ea7f --- /dev/null +++ b/test/common/json/json_sanitizer_corpus/punctuation @@ -0,0 +1 @@ +" `~!@#$%^&*()_+-={}|[]" \ No newline at end of file diff --git a/test/common/json/json_sanitizer_corpus/quotes_both_ends b/test/common/json/json_sanitizer_corpus/quotes_both_ends new file mode 100644 index 0000000000000..075c842d1b8cd --- /dev/null +++ b/test/common/json/json_sanitizer_corpus/quotes_both_ends @@ -0,0 +1 @@ +"a" \ No newline at end of file diff --git a/test/common/json/json_sanitizer_corpus/two_quotes_begin b/test/common/json/json_sanitizer_corpus/two_quotes_begin new file mode 100644 index 0000000000000..a261b18bf58be --- /dev/null +++ b/test/common/json/json_sanitizer_corpus/two_quotes_begin @@ -0,0 +1 @@ +""ab \ No newline at end of file diff --git a/test/common/json/json_sanitizer_corpus/two_quotes_end b/test/common/json/json_sanitizer_corpus/two_quotes_end new file mode 100644 index 0000000000000..2e95ab1bcedf9 --- /dev/null +++ b/test/common/json/json_sanitizer_corpus/two_quotes_end @@ -0,0 +1 @@ +ab"" \ No newline at end of file diff --git a/test/common/json/json_sanitizer_corpus/two_quotes_middle b/test/common/json/json_sanitizer_corpus/two_quotes_middle new file mode 100644 index 0000000000000..a45fcdef3dd6d --- /dev/null +++ b/test/common/json/json_sanitizer_corpus/two_quotes_middle @@ -0,0 +1 @@ +a""b \ No newline at end of file diff --git a/test/common/json/json_sanitizer_corpus/upper_case b/test/common/json/json_sanitizer_corpus/upper_case new file mode 100644 index 
0000000000000..a6860d918dfcb --- /dev/null +++ b/test/common/json/json_sanitizer_corpus/upper_case @@ -0,0 +1 @@ +ABCDEFGHIJKLMNOPQRSTUVWXYZ \ No newline at end of file diff --git a/test/common/json/json_sanitizer_fuzz_test.cc b/test/common/json/json_sanitizer_fuzz_test.cc new file mode 100644 index 0000000000000..93df5a03aba6e --- /dev/null +++ b/test/common/json/json_sanitizer_fuzz_test.cc @@ -0,0 +1,52 @@ +#include "source/common/json/json_sanitizer.h" +#include "source/common/protobuf/utility.h" + +#include "test/common/json/json_sanitizer_test_util.h" +#include "test/fuzz/fuzz_runner.h" +#include "test/fuzz/utility.h" +#include "test/test_common/utility.h" + +#include "absl/strings/str_format.h" + +namespace Envoy { +namespace Fuzz { + +const Envoy::Json::JsonSanitizer& staticSanitizer() { + CONSTRUCT_ON_FIRST_USE(Envoy::Json::JsonSanitizer); +} + +DEFINE_FUZZER(const uint8_t* buf, size_t len) { + const Envoy::Json::JsonSanitizer& sanitizer = staticSanitizer(); + FuzzedDataProvider provider(buf, len); + std::string buffer1, buffer2; + while (provider.remaining_bytes() != 0) { + std::string input = provider.ConsumeRandomLengthString(provider.remaining_bytes()); + absl::string_view hand_sanitized = sanitizer.sanitize(buffer1, input); + + // If the input is valid UTF-8 we can do a differential test against the + // Protobuf JSON sanitizer. Otherwise we are simply ensuring that the + // sanitizer does not crash. 
+ if (Envoy::Json::isProtoSerializableUtf8(input)) { + buffer2 = + MessageUtil::getJsonStringFromMessageOrDie(ValueUtil::stringValue(input), false, true); + absl::string_view proto_sanitized = Envoy::Json::stripDoubleQuotes(buffer2); + if (hand_sanitized != proto_sanitized) { + std::cerr << "ERROR on input = "; + for (char c : input) { + if (c == '\\' || c == '"') { + std::cerr << "\\" << c; + } else if (c < ' ' || c > 126) { + std::cerr << "\\" << absl::StrFormat("%03o", static_cast(c)); + } else { + std::cerr << c; + } + } + std::cerr << std::endl; + } + FUZZ_ASSERT_EQ(hand_sanitized, proto_sanitized, input); + } + } +} + +} // namespace Fuzz +} // namespace Envoy diff --git a/test/common/json/json_sanitizer_speed_test.cc b/test/common/json/json_sanitizer_speed_test.cc new file mode 100644 index 0000000000000..873dee4f02844 --- /dev/null +++ b/test/common/json/json_sanitizer_speed_test.cc @@ -0,0 +1,96 @@ +#include "source/common/json/json_internal.h" +#include "source/common/json/json_sanitizer.h" +#include "source/common/protobuf/utility.h" + +#include "benchmark/benchmark.h" + +// NOLINT(namespace-envoy) + +constexpr absl::string_view pass_through_encoding = "Now is the time for all good men"; +constexpr absl::string_view escaped_encoding = "Now for all good men"; + +const Envoy::Json::JsonSanitizer& staticSanitizer() { + CONSTRUCT_ON_FIRST_USE(Envoy::Json::JsonSanitizer); +} + +// NOLINTNEXTLINE(readability-identifier-naming) +static void BM_ProtoEncoderNoEscape(benchmark::State& state) { + const std::string str = std::string(pass_through_encoding); + + for (auto _ : state) { // NOLINT + Envoy::MessageUtil::getJsonStringFromMessageOrDie(Envoy::ValueUtil::stringValue(str), false, + true); + } +} +BENCHMARK(BM_ProtoEncoderNoEscape); + +// NOLINTNEXTLINE(readability-identifier-naming) +static void BM_JsonSanitizerNoEscape(benchmark::State& state) { + std::string buffer; + Envoy::Json::JsonSanitizer sanitizer; + + for (auto _ : state) { // NOLINT + 
sanitizer.sanitize(buffer, pass_through_encoding); + } +} +BENCHMARK(BM_JsonSanitizerNoEscape); + +// NOLINTNEXTLINE(readability-identifier-naming) +static void BM_NlohmannNoEscape(benchmark::State& state) { + for (auto _ : state) { // NOLINT + Envoy::Json::Nlohmann::Factory::serialize(pass_through_encoding); + } +} +BENCHMARK(BM_NlohmannNoEscape); + +// NOLINTNEXTLINE(readability-identifier-naming) +static void BM_StaticJsonSanitizerNoEscape(benchmark::State& state) { + std::string buffer; + + for (auto _ : state) { // NOLINT + staticSanitizer().sanitize(buffer, pass_through_encoding); + } +} +BENCHMARK(BM_StaticJsonSanitizerNoEscape); + +// NOLINTNEXTLINE(readability-identifier-naming) +static void BM_ProtoEncoderWithEscape(benchmark::State& state) { + const std::string str = std::string(escaped_encoding); + + for (auto _ : state) { // NOLINT + Envoy::MessageUtil::getJsonStringFromMessageOrDie(Envoy::ValueUtil::stringValue(str), false, + true); + } +} +BENCHMARK(BM_ProtoEncoderWithEscape); + +// NOLINTNEXTLINE(readability-identifier-naming) +static void BM_NlohmannWithEscape(benchmark::State& state) { + const std::string str = std::string(escaped_encoding); + + for (auto _ : state) { // NOLINT + Envoy::Json::Nlohmann::Factory::serialize(str); + } +} +BENCHMARK(BM_NlohmannWithEscape); + +// NOLINTNEXTLINE(readability-identifier-naming) +static void BM_JsonSanitizerWithEscape(benchmark::State& state) { + Envoy::Json::JsonSanitizer sanitizer; + std::string buffer; + + for (auto _ : state) { // NOLINT + sanitizer.sanitize(buffer, escaped_encoding); + } +} +BENCHMARK(BM_JsonSanitizerWithEscape); + +// NOLINTNEXTLINE(readability-identifier-naming) +static void BM_StaticJsonSanitizerWithEscape(benchmark::State& state) { + std::string buffer; + + for (auto _ : state) { // NOLINT + staticSanitizer().sanitize(buffer, escaped_encoding); + } +} +BENCHMARK(BM_StaticJsonSanitizerWithEscape); diff --git a/test/common/json/json_sanitizer_test.cc 
b/test/common/json/json_sanitizer_test.cc
new file mode 100644
index 0000000000000..162a60b964fed
--- /dev/null
+++ b/test/common/json/json_sanitizer_test.cc
@@ -0,0 +1,365 @@
+// NOTE(review): the bare `#include` below and several template-argument lists
+// in this file (e.g. `std::pair decode`, `reinterpret_cast(...)`,
+// `std::vector chars`) appear to have lost their `<...>` text -- confirm
+// against the upstream file.
+#include
+
+#include "source/common/json/json_internal.h"
+#include "source/common/json/json_sanitizer.h"
+#include "source/common/protobuf/utility.h"
+
+#include "test/common/json/json_sanitizer_test_util.h"
+
+#include "absl/strings/str_format.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using testing::StartsWith;
+
+namespace Envoy {
+namespace Json {
+namespace {
+
+// Sample multi-byte characters, each given both as a source literal and as
+// explicit octal-escaped utf-8 bytes (2-byte, 3-byte and 4-byte encodings).
+constexpr absl::string_view Lambda{"λ"};
+constexpr absl::string_view LambdaUtf8{"\316\273"};
+constexpr absl::string_view Omicron{"ό"};
+constexpr absl::string_view OmicronUtf8{"\341\275\271"};
+constexpr absl::string_view TrebleClefUtf8{"\360\235\204\236"};
+
+// Fixture that owns a JsonSanitizer and a scratch buffer, and provides helpers
+// that compare the sanitizer's output against protobuf and Nlohmann JSON
+// serialization.
+class JsonSanitizerTest : public testing::Test {
+protected:
+  using UnicodeSizePair = JsonSanitizer::UnicodeSizePair;
+
+  // Records whether GENERATE_INVALID_UTF8_RANGES is set in the environment and,
+  // the first time that is seen in the process, prints usage guidance.
+  JsonSanitizerTest() {
+    if (::getenv("GENERATE_INVALID_UTF8_RANGES") != nullptr) {
+      generate_invalid_utf8_ranges_ = true;
+      // Static so the message is printed once, not once per test case.
+      static bool message_emitted = false;
+      if (!message_emitted) {
+        std::cout << "Runs full sweep of 3-byte and 4-byte utf8 to find unicodes that protobufs "
+                     "cannot serialize, to collect them in ranges. The range initialization can "
+                     "then be pasted into json_sanitizer_test_util.cc so that future fuzz tests "
+                     "and unit tests can avoid doing differentials against protobuf ranges that "
+                     "cannot be support. This likely needs to be re-run when the protobufs "
+                     "dependency is updated. Be sure to run this piping the output through "
+                     " |& grep -v 'contains invalid UTF-8' as the protobuf library will generate "
+                     " that message thousands of times and there is no way to disable it."
+                  << std::endl;
+        message_emitted = true;
+      }
+    }
+  }
+
+  // Sanitizes `str` and expects the result to equal both the protobuf JSON
+  // serialization (quotes stripped) and the Nlohmann serialization (quotes
+  // stripped). Also expects `str` to be proto-serializable. Returns the
+  // sanitizer's output.
+  absl::string_view sanitizeAndCheckAgainstProtobufJson(absl::string_view str) {
+    EXPECT_TRUE(isProtoSerializableUtf8(str)) << "str=" << str;
+    absl::string_view hand_sanitized = sanitizer_.sanitize(buffer_, str);
+    // Guarded separately from the EXPECT above so that a non-serializable input
+    // reports one failure rather than also running the protobuf serializer.
+    if (isProtoSerializableUtf8(str)) {
+      std::string proto_sanitized = MessageUtil::getJsonStringFromMessageOrDie(
+          ValueUtil::stringValue(std::string(str)), false, true);
+      EXPECT_EQ(stripDoubleQuotes(proto_sanitized), hand_sanitized) << "str=" << str;
+    }
+    EXPECT_EQ(hand_sanitized, stripDoubleQuotes(Nlohmann::Factory::serialize(str)));
+    return hand_sanitized;
+  }
+
+  // Expects that sanitizing `str` yields `str` itself.
+  void expectUnchanged(absl::string_view str) {
+    EXPECT_EQ(str, sanitizeAndCheckAgainstProtobufJson(str));
+  }
+
+  // Drops the final byte of `str`; callers pass non-empty strings.
+  absl::string_view truncate(absl::string_view str) { return str.substr(0, str.size() - 1); }
+
+  // Returns a copy of `str` with the high four bits of its second byte set,
+  // which destroys the utf-8 continuation pattern of that byte.
+  std::string corruptByte2(absl::string_view str) {
+    std::string corrupt_second_byte = std::string(str);
+    ASSERT(str.size() >= 2);
+    corrupt_second_byte[1] |= '\xf0';
+    return corrupt_second_byte;
+  }
+
+  // Expects that `str` fails to decode as utf-8 (decode() yields (0, 0)), then
+  // returns the sanitizer's output for it.
+  absl::string_view sanitizeInvalid(absl::string_view str) {
+    EXPECT_EQ(UnicodeSizePair(0, 0), decode(str));
+    return sanitizer_.sanitize(buffer_, str);
+  }
+
+  // Forwards the bytes of `str` to JsonSanitizer::decodeUtf8 and returns its
+  // result pair.
+  std::pair decode(absl::string_view str) {
+    return JsonSanitizer::decodeUtf8(reinterpret_cast(str.data()), str.size());
+  }
+
+  JsonSanitizer sanitizer_;
+  std::string buffer_; // Scratch buffer handed to every sanitize() call.
+  // Set by the constructor; not read anywhere in the code visible here.
+  bool generate_invalid_utf8_ranges_{false};
+};
+
+TEST_F(JsonSanitizerTest, Empty) { expectUnchanged(""); }
+
+// Strings that must pass through the sanitizer untouched.
+TEST_F(JsonSanitizerTest, NoEscape) {
+  expectUnchanged("abcdefghijklmnopqrstuvwxyz");
+  expectUnchanged("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
+  expectUnchanged("1234567890");
+  expectUnchanged(" `~!@#$%^&*()_+-={}|[]");
+  expectUnchanged("Hello world, Καλημέρα κόσμε, コンニチハ");
+}
+
+// Characters that have a two-character backslash escape.
+TEST_F(JsonSanitizerTest, SlashChars) {
+  EXPECT_EQ("\\b", sanitizeAndCheckAgainstProtobufJson("\b"));
+  EXPECT_EQ("\\f", sanitizeAndCheckAgainstProtobufJson("\f"));
+  EXPECT_EQ("\\n", sanitizeAndCheckAgainstProtobufJson("\n"));
+  EXPECT_EQ("\\r", sanitizeAndCheckAgainstProtobufJson("\r"));
+  EXPECT_EQ("\\t", sanitizeAndCheckAgainstProtobufJson("\t"));
+  EXPECT_EQ("\\\\", sanitizeAndCheckAgainstProtobufJson("\\"));
+  EXPECT_EQ("\\\"", sanitizeAndCheckAgainstProtobufJson("\""));
+}
+
+// Low control characters, plus '<' and '>', which are emitted as \uXXXX unless
+// they have a symbolic escape.
+TEST_F(JsonSanitizerTest, ControlChars) {
+  EXPECT_EQ("\\u0001", sanitizeAndCheckAgainstProtobufJson("\001"));
+  EXPECT_EQ("\\u0002", sanitizeAndCheckAgainstProtobufJson("\002"));
+  EXPECT_EQ("\\b", sanitizeAndCheckAgainstProtobufJson("\010"));
+  EXPECT_EQ("\\t", sanitizeAndCheckAgainstProtobufJson("\011"));
+  EXPECT_EQ("\\n", sanitizeAndCheckAgainstProtobufJson("\012"));
+  EXPECT_EQ("\\u000b", sanitizeAndCheckAgainstProtobufJson("\013"));
+  EXPECT_EQ("\\f", sanitizeAndCheckAgainstProtobufJson("\014"));
+  EXPECT_EQ("\\r", sanitizeAndCheckAgainstProtobufJson("\015"));
+  EXPECT_EQ("\\u000e", sanitizeAndCheckAgainstProtobufJson("\016"));
+  EXPECT_EQ("\\u000f", sanitizeAndCheckAgainstProtobufJson("\017"));
+  EXPECT_EQ("\\u0010", sanitizeAndCheckAgainstProtobufJson("\020"));
+  EXPECT_EQ("\\u003c", sanitizeAndCheckAgainstProtobufJson("<"));
+  EXPECT_EQ("\\u003e", sanitizeAndCheckAgainstProtobufJson(">"));
+}
+
+TEST_F(JsonSanitizerTest, SevenBitAscii) {
+  // Cover all the 7-bit ascii values, calling sanitize so that it checks
+  // our hand-rolled sanitizer vs protobuf. We ignore the return-value of
+  // sanitize(); we are just calling for it to test against protobuf.
+  for (uint32_t i = 0; i < 128; ++i) {
+    char c = i;
+    sanitizeAndCheckAgainstProtobufJson(absl::string_view(&c, 1));
+  }
+}
+
+TEST_F(JsonSanitizerTest, Utf8) {
+  // Reference: https://www.charset.org/utf-8
+  // Builds a std::string from a list of raw byte values.
+  auto unicode = [](std::vector chars) -> std::string {
+    return std::string(reinterpret_cast(&chars[0]), chars.size());
+  };
+
+  sanitizeAndCheckAgainstProtobufJson(unicode({0xc2, 0xa2})); // Cent.
+  sanitizeAndCheckAgainstProtobufJson(unicode({0xc2, 0xa9})); // Copyright.
+  sanitizeAndCheckAgainstProtobufJson(unicode({0xc3, 0xa0})); // 'a' with accent grave.
+}
+
+// Escaped characters at the start, middle and end of ordinary text.
+TEST_F(JsonSanitizerTest, Interspersed) {
+  EXPECT_EQ("a\\bc", sanitizeAndCheckAgainstProtobufJson("a\bc"));
+  EXPECT_EQ("a\\b\\fc", sanitizeAndCheckAgainstProtobufJson("a\b\fc"));
+  EXPECT_EQ("\\bac", sanitizeAndCheckAgainstProtobufJson("\bac"));
+  EXPECT_EQ("\\b\\fac", sanitizeAndCheckAgainstProtobufJson("\b\fac"));
+  EXPECT_EQ("ac\\b", sanitizeAndCheckAgainstProtobufJson("ac\b"));
+  // NOTE(review): duplicate of the previous assertion.
+  EXPECT_EQ("ac\\b", sanitizeAndCheckAgainstProtobufJson("ac\b"));
+  EXPECT_EQ("\\ra\\f", sanitizeAndCheckAgainstProtobufJson("\ra\f"));
+}
+
+// Sweeps two-byte sequences (lead payload 2..31, continuation payload 0..63)
+// and checks each against protobuf and Nlohmann.
+TEST_F(JsonSanitizerTest, AllTwoByteUtf8) {
+  char buf[2];
+  absl::string_view utf8(buf, 2);
+  // NOTE(review): starts at 2, presumably because lead payloads 0 and 1 can
+  // only form overlong encodings -- confirm against decodeUtf8.
+  for (uint32_t byte1 = 2; byte1 < 32; ++byte1) {
+    buf[0] = byte1 | JsonSanitizer::Utf8_2BytePattern;
+    for (uint32_t byte2 = 0; byte2 < 64; ++byte2) {
+      buf[1] = byte2 | JsonSanitizer::Utf8_ContinuePattern;
+      auto [unicode, consumed] =
+          Envoy::Json::JsonSanitizer::decodeUtf8(reinterpret_cast(buf), 2);
+      ASSERT_EQ(2, consumed);
+      sanitizeAndCheckAgainstProtobufJson(utf8);
+    }
+  }
+}
+
+// Sweeps every three-byte pattern (16 * 64 * 64 combinations). Sequences that
+// isProtoSerializableUtf8 accepts must be utf8Equivalent to the protobuf
+// output; the rest are only counted.
+TEST_F(JsonSanitizerTest, AllThreeByteUtf8) {
+  std::string utf8("abc");
+  uint32_t num_excluded = 0, num_included = 0;
+  uint32_t num_matches = 0, num_mismatches = 0;
+  for (uint32_t byte1 = 0; byte1 < 16; ++byte1) {
+    utf8[0] = byte1 | JsonSanitizer::Utf8_3BytePattern;
+    for (uint32_t byte2 = 0; byte2 < 64; ++byte2) {
+      utf8[1] = byte2 | JsonSanitizer::Utf8_ContinuePattern;
+      for (uint32_t byte3 = 0; byte3 < 64; ++byte3) {
+        utf8[2] = byte3 | JsonSanitizer::Utf8_ContinuePattern;
+        absl::string_view hand_sanitized = sanitizer_.sanitize(buffer_, utf8);
+        if (isProtoSerializableUtf8(utf8)) {
+          ++num_included;
+          auto [unicode, consumed] = Envoy::Json::JsonSanitizer::decodeUtf8(
+              reinterpret_cast(utf8.data()), 3);
+          EXPECT_EQ(3, consumed);
+          std::string proto_sanitized =
+              MessageUtil::getJsonStringFromMessageOrDie(ValueUtil::stringValue(utf8), false, true);
+          EXPECT_TRUE(utf8Equivalent(stripDoubleQuotes(proto_sanitized), hand_sanitized))
+              << "(" << byte1 << "," << byte2 << "," << byte3 << ")";
+          // Re-evaluated so that mismatches are logged with the decoded code point.
+          if (utf8Equivalent(stripDoubleQuotes(proto_sanitized), hand_sanitized)) {
+            ++num_matches;
+          } else {
+            ENVOY_LOG_MISC(error, "unicode=0x{}, proto_sanitized={}",
+                           absl::StrFormat("%x", unicode), proto_sanitized);
+            ++num_mismatches;
+          }
+        } else {
+          ++num_excluded;
+        }
+      }
+    }
+  }
+  EXPECT_EQ(61440, num_included);
+  EXPECT_EQ(4096, num_excluded);
+  EXPECT_EQ(16 * 64 * 64, num_included + num_excluded);
+  EXPECT_EQ(61440, num_matches);
+  EXPECT_EQ(0, num_mismatches);
+}
+
+// This test takes 17 seconds without optimization.
+// NOTE(review): the NDEBUG guard below is commented out, so the test runs in
+// all build modes.
+//#ifdef NDEBUG
+// Same sweep as AllThreeByteUtf8 for every four-byte pattern
+// (16 * 64 * 64 * 64 combinations).
+TEST_F(JsonSanitizerTest, AllFourByteUtf8) {
+  std::string utf8("abcd");
+  uint32_t num_excluded = 0, num_included = 0;
+  uint32_t num_matches = 0, num_mismatches = 0;
+
+  for (uint32_t byte1 = 0; byte1 < 16; ++byte1) {
+    utf8[0] = byte1 | JsonSanitizer::Utf8_4BytePattern;
+    for (uint32_t byte2 = 0; byte2 < 64; ++byte2) {
+      utf8[1] = byte2 | JsonSanitizer::Utf8_ContinuePattern;
+      for (uint32_t byte3 = 0; byte3 < 64; ++byte3) {
+        utf8[2] = byte3 | JsonSanitizer::Utf8_ContinuePattern;
+        for (uint32_t byte4 = 0; byte4 < 64; ++byte4) {
+          utf8[3] = byte4 | JsonSanitizer::Utf8_ContinuePattern;
+          absl::string_view hand_sanitized = sanitizer_.sanitize(buffer_, utf8);
+          if (isProtoSerializableUtf8(utf8)) {
+            ++num_included;
+            auto [unicode, consumed] = Envoy::Json::JsonSanitizer::decodeUtf8(
+                reinterpret_cast(utf8.data()), 4);
+            EXPECT_EQ(4, consumed);
+            std::string proto_sanitized = MessageUtil::getJsonStringFromMessageOrDie(
+                ValueUtil::stringValue(utf8), false, true);
+            EXPECT_TRUE(utf8Equivalent(stripDoubleQuotes(proto_sanitized), hand_sanitized))
+                << "(" << byte1 << "," << byte2 << "," << byte3 << "," << byte4 << ")";
+            if (utf8Equivalent(stripDoubleQuotes(proto_sanitized), hand_sanitized)) {
+              ++num_matches;
+            } else {
+              ENVOY_LOG_MISC(error, "unicode=0x{}, proto_sanitized={}",
+                             absl::StrFormat("%x", unicode), proto_sanitized);
+              ++num_mismatches;
+            }
+          } else {
+            ++num_excluded;
+          }
+        }
+      }
+    }
+  }
+  // NOTE(review): the aggregate expectations are disabled, so the counters
+  // above are computed but never checked.
+  /*
+  EXPECT_EQ(1048576, num_included);
+  EXPECT_EQ(3145728, num_excluded);
+  EXPECT_EQ(1048471, num_matches);
+  EXPECT_EQ(105, num_mismatches);
+  */
+}
+//#endif
+
+// Checks the decoded code point and byte count for 2-, 3- and 4-byte samples.
+TEST_F(JsonSanitizerTest, MultiByteUtf8) {
+  EXPECT_EQ(UnicodeSizePair(0x3bb, 2), decode(Lambda));
+  EXPECT_EQ(UnicodeSizePair(0x3bb, 2), decode(LambdaUtf8));
+  EXPECT_EQ(UnicodeSizePair(0x1f79, 3), decode(Omicron));
+  EXPECT_EQ(UnicodeSizePair(0x1f79, 3), decode(OmicronUtf8));
+
+  // It's hard to find large unicode characters, but to test the utf8 decoder
+  // there are some in https://unicode-table.com/en/blocks/musical-symbols/
+  // with reference utf8 encoding from https://unicode-table.com/en/1D11E/
+  EXPECT_EQ(UnicodeSizePair(0x1d11e, 4), decode(TrebleClefUtf8));
+}
+
+TEST_F(JsonSanitizerTest, Low8Bit) {
+  // The characters from 0 to 0xBF (191) inclusive are all rendered identically
+  // to the protobuf json encoder. This test covers the 7-bit part of that
+  // range, 0 through 0x7f (127), in a single string.
+  std::string x0_7f;
+  for (uint32_t i = 0; i <= 0x7f; ++i) {
+    char ch = i;
+    x0_7f.push_back(ch);
+  }
+  EXPECT_EQ(
+      // Control-characters 0-31
+      "\\u0000\\u0001\\u0002\\u0003\\u0004\\u0005\\u0006\\u0007\\b\\t\\n"
+      "\\u000b\\f\\r\\u000e\\u000f\\u0010\\u0011\\u0012\\u0013\\u0014\\u0015"
+      "\\u0016\\u0017\\u0018\\u0019\\u001a\\u001b\\u001c\\u001d\\u001e\\u001f"
+
+      // Printable characters starting with space. Double-quote is back-slashed.
+      " !\\\"#$%&'()*+,-./0123456789:;"
+
+      // < and > are serialized by json as unicode.
+      "\\u003c=\\u003e?@ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+
+      // Remaining 7-bit codes ending with 127, which is rendered as a unicode escape.
+      "[\\\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\\u007f",
+
+      sanitizeAndCheckAgainstProtobufJson(x0_7f));
+}
+
+TEST_F(JsonSanitizerTest, High8Bit) {
+  std::string x80_ff;
+  for (uint32_t i = 0x80; i <= 0xff; ++i) {
+    char ch = i;
+    x80_ff.push_back(ch);
+  }
+  // The bytes from 0x80 (128) to 0xff (255) only occur inside multi-byte utf-8
+  // sequences, but in this context are not followed by the right continuation
+  // pattern. The protobuf json serializer generates lots of error messages
+  // for these and yields empty strings, but we just escape them as single
+  // bytes. For that reason this test calls the sanitizer directly and does
+  // not compare against protobuf.
+  EXPECT_EQ(
+      // The codes from 128 (0x80) to 159 (0x9f) are rendered as unicode
+      // escapes.
+      "\\u0080\\u0081\\u0082\\u0083\\u0084\\u0085\\u0086\\u0087\\u0088\\u0089"
+      "\\u008a\\u008b\\u008c\\u008d\\u008e\\u008f\\u0090\\u0091\\u0092\\u0093"
+      "\\u0094\\u0095\\u0096\\u0097\\u0098\\u0099\\u009a\\u009b\\u009c\\u009d"
+      "\\u009e\\u009f"
+
+      // Then a sequence of literal 8-bit characters.
+      "\xA0\xA1\xA2\xA3\xA4\xA5\xA6\xA7\xA8\xA9\xAA\xAB\xAC"
+
+      // Weird special-case behavior to match json sanitizer
+      "\\u00ad"
+
+      // More literal 8-bit characters.
+      "\xAE\xAF\xB0\xB1\xB2\xB3\xB4\xB5\xB6\xB7\xB8\xB9\xBA\xBB\xBC"
+      "\xBD\xBE\xBF"
+
+      // Codes with a utf8 introductory byte pattern that lack the correct
+      // pattern for the remaining codes. These get OCT-escaped by the json
+      // sanitizer, whereas the protobuf serializer generates an error message
+      // and returns an empty string.
+      "\\300\\301\\302\\303\\304\\305\\306\\307\\310\\311\\312\\313\\314\\315\\316\\317"
+      "\\320\\321\\322\\323\\324\\325\\326\\327\\330\\331\\332\\333\\334\\335\\336\\337"
+      "\\340\\341\\342\\343\\344\\345\\346\\347\\350\\351\\352\\353\\354\\355\\356\\357"
+      "\\360\\361\\362\\363\\364\\365\\366\\367\\370\\371\\372\\373\\374\\375\\376\\377",
+      sanitizer_.sanitize(buffer_, x80_ff));
+}
+
+// Truncated or corrupted multi-byte sequences are escaped byte-by-byte rather
+// than decoded.
+TEST_F(JsonSanitizerTest, InvalidUtf8) {
+  // 2 byte
+  EXPECT_EQ("\\316", sanitizeInvalid(truncate(LambdaUtf8)));
+  EXPECT_EQ("\\316\\373", sanitizeInvalid(corruptByte2(LambdaUtf8)));
+
+  // 3 byte
+  absl::string_view out = sanitizeInvalid(truncate(OmicronUtf8));
+  EXPECT_THAT(out, StartsWith("\\341"));
+  EXPECT_EQ(5, out.size());
+  EXPECT_EQ('\275', out[4]);
+  EXPECT_EQ("\\341\\375\271", sanitizeInvalid(corruptByte2(OmicronUtf8)));
+
+  // 4 byte
+  EXPECT_EQ("\\360\\u009d\\u0084", sanitizeInvalid(truncate(TrebleClefUtf8)));
+  EXPECT_EQ("\\360\\375\\u0084\\u009e", sanitizeInvalid(corruptByte2(TrebleClefUtf8)));
+
+  // Invalid input embedded in normal text.
+  EXPECT_EQ(
+      "Hello, \\360\\u009d\\u0084, World!",
+      sanitizer_.sanitize(buffer_, absl::StrCat("Hello, ", truncate(TrebleClefUtf8), ", World!")));
+
+  // Replicate a few other cases that were discovered during initial fuzzing,
+  // to ensure we see these as invalid utf8 and avoid them in comparisons.
+  EXPECT_FALSE(isProtoSerializableUtf8("_K\301\234K"));
+  EXPECT_FALSE(isProtoSerializableUtf8("\xF7\xA6\x8A\x8A"));
+  EXPECT_FALSE(isProtoSerializableUtf8("\020\377\377\376\000"));
+}
+
+} // namespace
+} // namespace Json
+} // namespace Envoy
diff --git a/test/common/json/json_sanitizer_test_util.cc b/test/common/json/json_sanitizer_test_util.cc
new file mode 100644
index 0000000000000..f746253ef9907
--- /dev/null
+++ b/test/common/json/json_sanitizer_test_util.cc
@@ -0,0 +1,185 @@
+#include "test/common/json/json_sanitizer_test_util.h"
+
+// NOTE(review): the header name of the next include, and template arguments
+// further down (`reinterpret_cast(...)`, `IntervalSetImpl`), appear to have
+// been stripped -- confirm against the upstream file.
+#include
+
+#include "source/common/common/utility.h"
+#include "source/common/json/json_sanitizer.h"
+
+#include "absl/strings/match.h"
+#include "absl/strings/numbers.h"
+
+namespace Envoy {
+namespace Json {
+
+// Returns `str` without its first and last characters when both are
+// double-quotes; otherwise returns `str` unchanged.
+absl::string_view stripDoubleQuotes(absl::string_view str) {
+  if (str.size() >= 2 && str[0] == '"' && str[str.size() - 1] == '"') {
+    return str.substr(1, str.size() - 2);
+  }
+  return str;
+}
+
+namespace {
+
+// Code points that tests must not compare against protobuf serialization,
+// kept as interval sets: one for 3-byte utf-8, one for 4-byte utf-8.
+// Intervals are half-open: [left, right).
+class InvalidUnicodeSet {
+public:
+  InvalidUnicodeSet() {
+    // Generated with
+    //   bazel build -c opt test/common/json:json_sanitizer_test
+    //   GENERATE_INVALID_UTF8_RANGES=1
+    //     ./bazel-bin/test/common/json/json_sanitizer_test |&
+    //     grep -v 'contains invalid UTF-8'
+
+    // Avoid ranges where the protobuf serialization fails, returning
+    // an empty string.
+    invalid_3byte_intervals_.insert(0xd800, 0xe000);
+
+    // Avoid unicode ranges generated from 4-byte utf-8 where protobuf
+    // serialization generates two small unicode values instead of the correct one.
+    // This must be a protobuf serialization issue.
+    invalid_4byte_intervals_.insert(0x1d173, 0x1d17b);
+    invalid_4byte_intervals_.insert(0xe0001, 0xe0002);
+    invalid_4byte_intervals_.insert(0xe0020, 0xe0080);
+  }
+
+  // Helper functions to see if the specified unicode is in the 3-byte utf-8
+  // exclusion set or the 4-byte utf-8 exclusion-set.
+  bool isInvalid3Byte(uint32_t unicode) const { return invalid_3byte_intervals_.test(unicode); }
+  bool isInvalid4Byte(uint32_t unicode) const { return invalid_4byte_intervals_.test(unicode); }
+
+private:
+  IntervalSetImpl invalid_3byte_intervals_;
+  IntervalSetImpl invalid_4byte_intervals_;
+};
+
+// Returns the shared exclusion set, constructed on first use.
+const InvalidUnicodeSet& invalidUnicodeSet() { CONSTRUCT_ON_FIRST_USE(InvalidUnicodeSet); }
+
+} // namespace
+
+// Walks `in` one character at a time. Bytes without the high bit are accepted
+// as-is. Every other position must decode (via JsonSanitizer::decodeUtf8) as a
+// 2-, 3- or 4-byte sequence whose code point is not in the exclusion set for
+// that length. Returns false at the first position that fails, true if the
+// whole input is consumed.
+bool isProtoSerializableUtf8(absl::string_view in) {
+  const uint8_t* data = reinterpret_cast(in.data());
+  uint32_t size = in.size();
+  while (size != 0) {
+    if ((*data & 0x80) == 0) {
+      // 7-bit character: always serializable.
+      ++data;
+      --size;
+    } else {
+      auto [unicode, consumed] = Envoy::Json::JsonSanitizer::decodeUtf8(data, size);
+      data += consumed;
+      size -= consumed;
+
+      switch (consumed) {
+      case 2:
+        break;
+      case 3:
+        if (invalidUnicodeSet().isInvalid3Byte(unicode)) {
+          return false;
+        }
+        break;
+      case 4:
+        if (invalidUnicodeSet().isInvalid4Byte(unicode)) {
+          return false;
+        }
+        break;
+      default:
+        // Any other consumed count (including 0) is treated as a decode
+        // failure; returning here also prevents looping without progress.
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+// Implements strtol for hex, but accepting a non-nul-terminated string_view,
+// and with one branch per character. This can be done with only one branch
+// per string if we use a table instead of a switch statement, and have all
+// the non-hex character inputs map to 0x80, and accumulate the OR of all
+// mapped values to test after the loop, but that would be harder to read.
+//
+// It is good for this code to be somewhat faster (ie not create a temp string)
+// so that fuzzers can run faster and cover more cases.
+//
+// If a string-view based hex decoder is useful in production code, this
+// could be factored into a decode() variant in source/common/common.hex.cc.
+bool parseUnicode(absl::string_view str, uint32_t& hex_value) {
+  // Needs the two-character "\u" prefix followed by four hex digits.
+  if (!absl::StartsWith(str, "\\u") || str.size() < 6) {
+    return false;
+  }
+
+  hex_value = 0;
+  for (const char c : str.substr(2, 4)) {
+    uint32_t digit = 0;
+    if (c >= '0' && c <= '9') {
+      digit = c - '0';
+    } else if (c >= 'A' && c <= 'F') {
+      digit = c - 'A' + 10;
+    } else if (c >= 'a' && c <= 'f') {
+      digit = c - 'a' + 10;
+    } else {
+      // Not a hex digit; hex_value is left holding the partial result.
+      return false;
+    }
+    hex_value = hex_value * 16 + digit;
+  }
+  return true;
+}
+
+// Tests whether `escaped` begins with a \uXXXX escape that names the same code
+// point as the 3-byte utf-8 sequence at the front of `utf8`. On a match both
+// views are advanced past the matched text (6 and 3 characters respectively)
+// and true is returned; otherwise both views are left untouched.
+bool compareUnicodeEscapeAgainstUtf8(absl::string_view& escaped, absl::string_view& utf8) {
+  uint32_t escaped_unicode;
+  if (utf8.size() < 3 || !parseUnicode(escaped, escaped_unicode)) {
+    return false;
+  }
+
+  // `escaped` starts with a unicode escape, so decode the other side to see
+  // whether it spells the same code point in exactly three bytes.
+  auto [unicode, consumed] = Envoy::Json::JsonSanitizer::decodeUtf8(
+      reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
+  if (consumed != 3 || unicode != escaped_unicode) {
+    return false;
+  }
+
+  utf8.remove_prefix(3);
+  escaped.remove_prefix(6);
+  return true;
+}
+
+// Determines whether two strings differ only in whether they have
+// literal utf-8 or escaped 3-byte unicode. We do this equivalence
+// comparison to enable differential fuzzing between JsonSanitizer and
+// protobuf json serialization. The protobuf implementation has made
+// some hard-to-understand decisions about what to encode via unicode
+// escapes versus what to pass through as utf-8.
+bool utf8Equivalent(absl::string_view a, absl::string_view b) { + while (true) { + if (a.empty() && b.empty()) { + return true; + } else if (a.empty() || b.empty()) { + return false; + } else if (a[0] == b[0]) { + a = a.substr(1, a.size() - 1); + b = b.substr(1, b.size() - 1); + } else if (!compareUnicodeEscapeAgainstUtf8(a, b) && !compareUnicodeEscapeAgainstUtf8(b, a)) { + return false; + } + } +} + +} // namespace Json +} // namespace Envoy diff --git a/test/common/json/json_sanitizer_test_util.h b/test/common/json/json_sanitizer_test_util.h new file mode 100644 index 0000000000000..e978cc86598dd --- /dev/null +++ b/test/common/json/json_sanitizer_test_util.h @@ -0,0 +1,29 @@ +#pragma once + +#include "absl/strings/string_view.h" + +namespace Envoy { +namespace Json { + +/** + * Strips double-quotes on first and last characters of str. + * + * @param str The string to strip double-quotes from. + * @return The string without its surrounding double-quotes. + */ +absl::string_view stripDoubleQuotes(absl::string_view str); + +/** + * Determines whether the input string can be serialized by protobufs. This is + * used for testing, to avoid trying to do differentials against Protobuf json + * sanitization, which produces noisy error messages and empty strings when + * presented with some utf8 sequences that are valid according to spec. + * + * @param in the string to validate as utf-8. 
+ */ +bool isProtoSerializableUtf8(absl::string_view in); + +bool utf8Equivalent(absl::string_view a, absl::string_view b); + +} // namespace Json +} // namespace Envoy diff --git a/test/fuzz/BUILD b/test/fuzz/BUILD index 115801409b376..9370f55376bf5 100644 --- a/test/fuzz/BUILD +++ b/test/fuzz/BUILD @@ -61,10 +61,12 @@ envoy_cc_test_library( envoy_cc_test_library( name = "utility_lib", + srcs = ["utility.cc"], hdrs = ["utility.h"], deps = [ ":common_proto_cc_proto", "//source/common/common:empty_string", + "//source/common/common:logger_lib", "//source/common/network:resolver_lib", "//source/common/network:utility_lib", "//test/common/stream_info:test_util", diff --git a/test/fuzz/utility.cc b/test/fuzz/utility.cc new file mode 100644 index 0000000000000..810007e000c07 --- /dev/null +++ b/test/fuzz/utility.cc @@ -0,0 +1,27 @@ +#include "test/fuzz/utility.h" + +#include "source/common/common/logger.h" + +#include "absl/strings/str_format.h" + +namespace Envoy { +namespace Fuzz { + +std::vector fuzzFindDiffs(absl::string_view expected, absl::string_view actual) { + std::vector diffs; + const uint32_t max_diffs = 5; + if (expected.size() != actual.size()) { + diffs.push_back(absl::StrCat("Size mismatch: ", expected.size(), " != ", actual.size())); + } + uint32_t min_size = std::min(expected.size(), actual.size()); + for (uint32_t i = 0; i < min_size && diffs.size() < max_diffs; ++i) { + if (expected[i] != actual[i]) { + diffs.push_back(absl::StrFormat("[%d]: %c(%u) != %c(%u)", i, expected[i], expected[i], + actual[i], actual[i])); + } + } + return diffs; +} + +} // namespace Fuzz +} // namespace Envoy diff --git a/test/fuzz/utility.h b/test/fuzz/utility.h index 55b4772e15639..7593f24f11477 100644 --- a/test/fuzz/utility.h +++ b/test/fuzz/utility.h @@ -195,5 +195,16 @@ inline std::vector parseHttpData(const test::fuzz::HttpData& data) return data_chunks; } +// Returns a vector of differences between expected and actual. 
An empty array indicates +// expected==actual +std::vector fuzzFindDiffs(absl::string_view expected, absl::string_view actual); + +#define FUZZ_ASSERT_EQ(expected, actual, annotation) \ + { \ + std::vector diffs = fuzzFindDiffs(expected, actual); \ + RELEASE_ASSERT(expected == actual, absl::StrCat(annotation, ": ", expected, " != ", actual, \ + "\n ", absl::StrJoin(diffs, "\n "))); \ + } + } // namespace Fuzz } // namespace Envoy diff --git a/tools/spelling/spelling_dictionary.txt b/tools/spelling/spelling_dictionary.txt index 9a2d46e4841c5..7d0c29bf0432f 100644 --- a/tools/spelling/spelling_dictionary.txt +++ b/tools/spelling/spelling_dictionary.txt @@ -135,6 +135,7 @@ FREEBIND FUZZER FUZZERS dereferencing +differentially dnsresolvers guarddog GC @@ -1103,6 +1104,7 @@ rver rxhash sandboxed sanitization +sanitizations sanitizer satisfiable scalability