envoyproxy · jmarantz · Mar 20, 2022 · Mar 20, 2022 · Mar 20, 2022 · Mar 20, 2022
diff --git a/envoy/common/interval_set.h b/envoy/common/interval_set.h
@@ -37,6 +37,14 @@ template <typename Value> class IntervalSet {
    * Clears the contents of the interval set.
    */
   virtual void clear() PURE;
+
+  /**
+   * Determines whether the specified Value is in any of the intervals.
+   *
+   * @param value the value
+   * @return true if value is covered in the inteval set.
+   */
+  virtual bool test(Value value) const PURE;
 };
 
 } // namespace Envoy
diff --git a/source/common/common/utility.h b/source/common/common/utility.h
@@ -561,6 +561,11 @@ template <typename Value> class IntervalSetImpl : public IntervalSet<Value> {
     intervals_.insert(Interval(left, right));
   }
 
+  bool test(Value value) const override {
+    const auto left_pos = intervals_.lower_bound(Interval(value, value + 1));
+    return left_pos != intervals_.end() && value >= left_pos->first && value < left_pos->second;
+  }
+
   std::vector<Interval> toVector() const override {
     return std::vector<Interval>(intervals_.begin(), intervals_.end());
   }

diff --git a/source/common/json/BUILD b/source/common/json/BUILD
@@ -34,3 +34,10 @@ envoy_cc_library(
         "//source/common/runtime:runtime_features_lib",
     ],
 )
+
+envoy_cc_library(
+    name = "json_sanitizer_lib",
+    srcs = ["json_sanitizer.cc"],
+    hdrs = ["json_sanitizer.h"],
+    deps = ["//source/common/common:assert_lib"],
+)
diff --git a/source/common/json/json_sanitizer.cc b/source/common/json/json_sanitizer.cc
@@ -0,0 +1,239 @@
+#include "source/common/json/json_sanitizer.h"
+
+#include <utility>
+
+#include "source/common/common/assert.h"
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+
+namespace Envoy {
+namespace Json {
+
+namespace {
+
+const uint8_t Literal = 0;
+const uint8_t ControlEscapeSize = 2; // e.g. \b
+const uint8_t UnicodeEscapeSize = 6; // e.g. \u1234
+const uint8_t Utf8DecodeSentinel = 0xff;
+
+} // namespace
+
+JsonSanitizer::JsonSanitizer() {
+  // Single-char escape sequences for common control characters.
+  auto symbolic_escape = [this](char control_char, char symbolic) {
+    Escape& escape = char_escapes_[char2uint32(control_char)];
+    escape.size_ = ControlEscapeSize;
+    escape.chars_[0] = '\\';
+    escape.chars_[1] = symbolic;
+  };
+  symbolic_escape('\b', 'b');
+  symbolic_escape('\f', 'f');
+  symbolic_escape('\n', 'n');
+  symbolic_escape('\r', 'r');
+  symbolic_escape('\t', 't');
+  symbolic_escape('\\', '\\');
+  symbolic_escape('"', '"');
+
+  // Low characters (0-31) not listed above are encoded as unicode 4-digit hex.
+  auto unicode_escape = [this](uint32_t index) {
+    // We capture unicode Escapes both in a char-indexed array, for direct
+    // substitutions on literal inputs, and in a unicode-indexed hash-map,
+    // for lookup after utf8 decode.
+    std::string escape_str = absl::StrFormat("\\u%04x", index);
+    ASSERT(escape_str.size() == UnicodeEscapeSize);
+    Escape& escape = unicode_escapes_[index];
+    escape.size_ = escape_str.size();
+    RELEASE_ASSERT(escape.size_ <= sizeof(escape.chars_), "escaped string too large");
+    memcpy(escape.chars_, escape_str.data(), escape_str.size()); // NOLINT(safe-memcpy)*/
+    if (index < NumEscapes) {
+      char_escapes_[index] = escape;
+    }
+  };
+
+  // Add unicode escapes for control-characters below 32 that don't have symbolic escapes.
+  for (uint32_t i = 0; i < ' '; ++i) {
+    if (char_escapes_[i].size_ == 0) {
+      unicode_escape(i);
+    }
+  }
+
+  // Unicode-escaped ascii constants above SPACE (32).
+  for (char ch : {'<', '>', '\177'}) {
+    unicode_escape(char2uint32(ch));
+  }
+
+  // There's a range of 8-bit characters that are unicode escaped by the
+  // protobuf library, so we match behavior.
+  for (uint32_t i = 0x0080; i < 0x00a0; ++i) {
+    unicode_escape(i);
+  }
+
+  // The remaining unicode characters are mostly passed through literally. We'll
+  // initialize all of them and then override some below.
+  for (uint32_t i = 0x00a0; i < NumEscapes; ++i) {
+    char_escapes_[i].size_ = Literal;
+  }
+
+  // All the bytes matching pattern 11xxxxxx will be evaluated as utf-8.
+  for (uint32_t i = Utf8_2BytePattern; i <= 0xff; ++i) {
+    char_escapes_[i].size_ = Utf8DecodeSentinel;
+  }
+
+  // There are an assortment of unicode characters that protobufs quote, so we
+  // do likewise here to make differential testing/fuzzing feasible.
+  for (uint32_t i : {0x00ad, 0x0600, 0x0601, 0x0602, 0x0603, 0x06dd, 0x070f}) {
+    unicode_escape(i);
+  }
+}
+
+absl::string_view JsonSanitizer::sanitize(std::string& buffer, absl::string_view str) const {
+  // Fast-path to see whether any escapes or utf-encoding are needed. If str has
+  // only unescaped ascii characters, we can simply return it. So before doing
+  // anything too fancy, do a lookup in char_escapes_ for each character, and
+  // simply OR in the return sizes. We use 0 for the return-size when we are
+  // simply leaving the character as is, so anything non-zero means we need to
+  // initiate the slow path.
+  //
+  // Benchmarks show it's faster to just rip through the string with no
+  // conditionals, so we only check the ORed sizes after the loop. This avoids
+  // branches and allows simpler loop unrolling by the compiler.
+  uint32_t sizes_ored_together = 0;
+  for (char c : str) {
+    sizes_ored_together |= char_escapes_[char2uint32(c)].size_;
+  }
+  if (sizes_ored_together == 0) {
+    return str; // Happy path, should be executed most of the time.
+  }
+  return slowSanitize(buffer, str);
+}
+
+absl::string_view JsonSanitizer::slowSanitize(std::string& buffer, absl::string_view str) const {
+  std::string oct_escape_buf;
+  size_t past_escape = absl::string_view::npos;
+  const uint8_t* first = reinterpret_cast<const uint8_t*>(str.data());
+  const uint8_t* data = first;
+  absl::string_view escape_view;
+  for (uint32_t n = str.size(); n != 0; ++data, --n) {
+    const Escape& escape = char_escapes_[*data];
+    if (escape.size_ != Literal) {
+      uint32_t start_of_escape = data - first;
+      switch (escape.size_) {
+      case ControlEscapeSize:
+      case UnicodeEscapeSize:
+        escape_view = absl::string_view(escape.chars_, escape.size_);
+        break;
+      case Utf8DecodeSentinel: {
+        auto [unicode, consumed] = decodeUtf8(data, n);
+        if (consumed != 0) {
+          --consumed;
+          data += consumed;
+          n -= consumed;
+
+          // Having validated and constructed the unicode for the utf-8
+          // sequence we must determine whether to render it literally by
+          // simply leaving it alone, or whether we ought to render it
+          // as a unicode escape. We do this using a hash-map set up during
+          // the constructor with all desired unicode escapes, to mimic the
+          // behavior of the protobuf json serializer.
+          auto iter = unicode_escapes_.find(unicode);
+          if (iter == unicode_escapes_.end()) {
+            continue;
+          }
+          escape_view = absl::string_view(iter->second.chars_, iter->second.size_);
+        } else {
+          // Using StrFormat during decode seems slow, but this case should be
+          // rare.
+          oct_escape_buf = absl::StrFormat("\\%03o", *data);
+          escape_view = absl::string_view(oct_escape_buf);
+        }
+        break;
+      }
+      default:
+        ASSERT(false);
+      }
+
+      if (past_escape == absl::string_view::npos) {
+        // We only initialize buffer when we first learn we need to add an
+        // escape-sequence to the sanitized string.
+        if (start_of_escape == 0) {
+          // The first character is an escape, and 'buffer' has not been cleared yet,
+          // so we need to assign it rather than append to it.
+          buffer.assign(escape_view.data(), escape_view.size());
+        } else {
+          // We found our first escape, but this is not the first character in the
+          // string, so we combine the unescaped characters in the string we already
+          // looped over with the new escaped character.
+          buffer = absl::StrCat(str.substr(0, start_of_escape), escape_view);
+        }
+      } else if (start_of_escape == past_escape) {
+        // We are adding an escape immediately after another escaped character.
+        absl::StrAppend(&buffer, escape_view);
+      } else {
+        // We are adding a new escape but must first cover the characters
+        // encountered since the previous escape.
+        absl::StrAppend(&buffer, str.substr(past_escape, start_of_escape - past_escape),
+                        escape_view);
+      }
+      past_escape = data - first + 1;
+    }
+  }
-  size_t past_escape = absl::string_view::npos;
-  for (uint32_t i = 0, n = str.size(); i < n; ++i) {
-    uint32_t index = static_cast<uint32_t>(static_cast<uint8_t>(str[i]));
-    const std::string* escape = char_escapes_[index].get();
-    if (escape != nullptr) {
-      if (past_escape == absl::string_view::npos) {
-        // We only initialize buffer when we first learn we need to add an
-        // escape-sequence to the sanitized string.
-        if (i == 0) {
-          buffer = *escape;
-        } else {
-          buffer = absl::StrCat(str.substr(0, i), *escape);
-        }
-      } else if (i == past_escape) {
-        absl::StrAppend(&buffer, *escape);
-      } else {
-        absl::StrAppend(&buffer, str.substr(past_escape, i - past_escape), *escape);
-      }
-      past_escape = i + 1;
-    }
-  }
+  size_t past_escape = 0;
+  for (uint32_t i = 0, n = str.size(); i < n; ++i) {
+    const std::string* escape = char_escapes_[static_cast<uint8_t>(str[i])].get();
+    if (escape != nullptr) {
+      absl::StrAppend(&buffer, str.substr(past_escape, i - past_escape), *escape);
+      past_escape = i + 1;
+    }
+  }
-  size_t past_escape = absl::string_view::npos;
-  for (uint32_t i = 0, n = str.size(); i < n; ++i) {
-    uint32_t index = static_cast<uint32_t>(static_cast<uint8_t>(str[i]));
-    const std::string* escape = char_escapes_[index].get();
-    if (escape != nullptr) {
-      if (past_escape == absl::string_view::npos) {
-        // We only initialize buffer when we first learn we need to add an
-        // escape-sequence to the sanitized string.
-        if (i == 0) {
-          buffer = *escape;
-        } else {
-          buffer = absl::StrCat(str.substr(0, i), *escape);
-        }
-      } else if (i == past_escape) {
-        absl::StrAppend(&buffer, *escape);
-      } else {
-        absl::StrAppend(&buffer, str.substr(past_escape, i - past_escape), *escape);
-      }
-      past_escape = i + 1;
-    }
-  }
+  size_t past_escape = 0;
+  for (uint32_t i = 0, n = str.size(); i < n; ++i) {
+    const std::string* escape = char_escapes_[static_cast<uint8_t>(str[i])].get();
+    if (escape != nullptr) {
+      absl::StrAppend(&buffer, str.substr(past_escape, i - past_escape), *escape);
+      past_escape = i + 1;
+    }
+  }
+
+  // If no escape-sequence was needed, we just return the input.
+  if (past_escape == absl::string_view::npos) {
+    return str;
+  }
+
+  // Otherwise we append on any unescaped chunk at the end of the input, and
+  // return buffer as the result.
+  if (past_escape < str.size()) {
+    absl::StrAppend(&buffer, str.substr(past_escape, str.size() - past_escape));
+  }
+  return buffer;
+}
+
+std::pair<uint32_t, uint32_t> JsonSanitizer::decodeUtf8(const uint8_t* bytes, uint32_t size) {
+  uint32_t unicode = 0;
+  uint32_t consumed = 0;
+
+  // See table in https://en.wikipedia.org/wiki/UTF-8, "Encoding" section.
+  //
+  // See also https://en.cppreference.com/w/cpp/locale/codecvt_utf8 which is
+  // marked as deprecated. There is also support in Windows libraries and Boost,
+  // which can be discovered on StackOverflow. I could not find a usable OSS
+  // implementation. However it's easily derived from the spec on Wikipedia.
+  //
+  // Note that the code below could be optimized a bit, e.g. by factoring out
+  // repeated lookups of the same index in the bytes array and using SSE
+  // instructions for the multi-word bit hacking.
+  //
+  // See also http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ which might be a lot
+  // faster, though less readable. As coded, though, it looks like it would read
+  // past the end of the input if the input is malformed.
+  if (size >= 2 && (bytes[0] & Utf8_2ByteMask) == Utf8_2BytePattern &&
+      (bytes[1] & Utf8_ContinueMask) == Utf8_ContinuePattern) {
+    unicode = bytes[0] & ~Utf8_2ByteMask;
+    unicode = (unicode << Utf8_Shift) | (bytes[1] & ~Utf8_ContinueMask);
+    consumed = 2;
+  } else if (size >= 3 && (bytes[0] & Utf8_3ByteMask) == Utf8_3BytePattern &&
+             (bytes[1] & Utf8_ContinueMask) == Utf8_ContinuePattern &&
+             (bytes[2] & Utf8_ContinueMask) == Utf8_ContinuePattern) {
+    unicode = bytes[0] & ~Utf8_3ByteMask;
+    unicode = (unicode << Utf8_Shift) | (bytes[1] & ~Utf8_ContinueMask);
+    unicode = (unicode << Utf8_Shift) | (bytes[2] & ~Utf8_ContinueMask);
+    consumed = 3;
+  } else if (size >= 4 && (bytes[0] & Utf8_4ByteMask) == Utf8_4BytePattern &&
+             (bytes[1] & Utf8_ContinueMask) == Utf8_ContinuePattern &&
+             (bytes[2] & Utf8_ContinueMask) == Utf8_ContinuePattern &&
+             (bytes[3] & Utf8_ContinueMask) == Utf8_ContinuePattern) {
+    unicode = bytes[0] & ~Utf8_4ByteMask;
+    unicode = (unicode << Utf8_Shift) | (bytes[1] & ~Utf8_ContinueMask);
+    unicode = (unicode << Utf8_Shift) | (bytes[2] & ~Utf8_ContinueMask);
+    unicode = (unicode << Utf8_Shift) | (bytes[3] & ~Utf8_ContinueMask);
+    consumed = 4;
+  }
+  return UnicodeSizePair(unicode, consumed);
+}
+
+} // namespace Json
+} // namespace Envoy
diff --git a/source/common/json/json_sanitizer.h b/source/common/json/json_sanitizer.h
@@ -0,0 +1,93 @@
+#pragma once
+
+#include <string>
+
+#include "absl/container/flat_hash_map.h"
+#include "absl/strings/string_view.h"
+
+namespace Envoy {
+namespace Json {
+
+// Hand-rolled JSON sanitizer that has exactly the same behavior as serializing
+// through protobufs, but is more than 10x faster. From
+// test/common/json/json_sanitizer_speed_test.cc:
+//
+// ---------------------------------------------------------------------------
+// Benchmark                                 Time             CPU   Iterations
+// ---------------------------------------------------------------------------
+// BM_ProtoEncoderNoEscape                1123 ns         1123 ns       545345
+// BM_JsonSanitizerNoEscape               8.77 ns         8.77 ns     79517538
+// BM_StaticJsonSanitizerNoEscape         9.52 ns         9.52 ns     73570603
+// BM_ProtoEncoderWithEscape              1326 ns         1326 ns       528576
+// BM_JsonSanitizerWithEscape             96.3 ns         96.3 ns      7289627
+// BM_StaticJsonSanitizerWithEscape       97.5 ns         97.5 ns      7157098
+//
+class JsonSanitizer {
+public:
+  static constexpr uint32_t Utf8_2ByteMask = 0b11100000;
+  static constexpr uint32_t Utf8_3ByteMask = 0b11110000;
+  static constexpr uint32_t Utf8_4ByteMask = 0b11111000;
+
+  static constexpr uint32_t Utf8_2BytePattern = 0b11000000;
+  static constexpr uint32_t Utf8_3BytePattern = 0b11100000;
+  static constexpr uint32_t Utf8_4BytePattern = 0b11110000;
+
+  static constexpr uint32_t Utf8_ContinueMask = 0b11000000;
+  static constexpr uint32_t Utf8_ContinuePattern = 0b10000000;
+
+  static constexpr uint32_t Utf8_Shift = 6;
+
+  // Constructing the sanitizer fills in a table with all escape-sequences,
+  // indexed by character. To make this perform well, you should instantiate the
+  // sanitizer in a context that lives across a large number of sanitizations.
+  JsonSanitizer();
+
+  /**
+   * Sanitizes a string so it is suitable for JSON. The buffer is
+   * used if any of the characters in str need to be escaped.
+   *
+   * @param buffer a string in which an escaped string can be written, if needed. It
+   *   is not necessary for callers to clear the buffer first; it be cleared
+   *   by this method if the input needs to be escaped.
+   * @param str the string to be translated
+   * @return the translated string_view.
+   */
+  absl::string_view sanitize(std::string& buffer, absl::string_view str) const;
+
+  /** The Unicode code-point and the number of utf8-bytes consumed */
+  using UnicodeSizePair = std::pair<uint32_t, uint32_t>;
+
+  /**
+   * Decodes a byte-stream of UTF8, returning the resulting unicode and the
+   * number of bytes consumed as a pair.
+   *
+   * @param bytes The data with utf8 bytes.
+   * @param size The number of bytes available in data
+   * @return UnicodeSizePair(unicode, consumed) -- if the decode fails consumed will be 0.
+   */
+  static UnicodeSizePair decodeUtf8(const uint8_t* bytes, uint32_t size);
+
+private:
+  // static constexpr uint32_t NumEscapes = 1 << 11; // 2^11=2048 codes possible in 2-byte utf8.
+  static constexpr uint32_t NumEscapes = 256;
+
+  // Character-indexed array of translation strings. If an entry is nullptr then
+  // the character does not require substitution. This strategy is dependent on
+  // the property of UTF-8 where all two-byte characters have the high-order bit
+  // set for both bytes, and don't require escaping for JSON. Thus we can
+  // consider each character in isolation for escaping. Reference:
+  // https://en.wikipedia.org/wiki/UTF-8.
+  struct Escape {
+    uint8_t size_{0};
+    char chars_[7]; // No need to initialize char data, as we are not null-terminating.
+  };
+
+  static uint32_t char2uint32(char c) { return static_cast<uint32_t>(static_cast<uint8_t>(c)); }
+  absl::string_view slowSanitize(std::string& buffer, absl::string_view str) const;
+
+  Escape char_escapes_[NumEscapes];
+  absl::flat_hash_map<uint32_t, Escape> unicode_escapes_;
+};
+
+} // namespace Json
+} // namespace Envoy
diff --git a/test/common/common/utility_test.cc b/test/common/common/utility_test.cc
@@ -931,6 +931,21 @@ TEST(IntervalSet, testIntervalTargeted) {
   EXPECT_EQ("[15, 20), [25, 30), [35, 40), [41, 43)", test(41, 43));
 }
 
+TEST(IntervalSet, testTest) {
+  IntervalSetImpl<uint32_t> set;
+  set.insert(4, 6);
+  EXPECT_FALSE(set.test(0));
+  set.insert(0, 2);
+  EXPECT_TRUE(set.test(0));
+  EXPECT_TRUE(set.test(1));
+  EXPECT_FALSE(set.test(2));
+  EXPECT_FALSE(set.test(3));
+  EXPECT_TRUE(set.test(4));
+  EXPECT_TRUE(set.test(5));
+  EXPECT_FALSE(set.test(6));
+  EXPECT_FALSE(set.test(7));
+}
+
 TEST(WelfordStandardDeviation, AllEntriesTheSame) {
   WelfordStandardDeviation wsd;
   wsd.update(10);