envoyproxy · jmarantz · Mar 20, 2022 · Mar 20, 2022 · Mar 20, 2022 · Mar 20, 2022
diff --git a/source/common/json/BUILD b/source/common/json/BUILD
@@ -34,3 +34,15 @@ envoy_cc_library(
         "//source/common/runtime:runtime_features_lib",
     ],
 )
+
+envoy_cc_library(
+    name = "json_sanitizer_lib",
+    srcs = ["json_sanitizer.cc"],
+    hdrs = ["json_sanitizer.h"],
+    external_deps = [
+        "json",
+    ],
+    deps = [
+        "//source/common/common:assert_lib",
+    ],
+)
diff --git a/source/common/json/json_sanitizer.cc b/source/common/json/json_sanitizer.cc
@@ -0,0 +1,94 @@
+#include "source/common/json/json_sanitizer.h"
+
+#include "source/common/common/assert.h"
+
+#include "absl/strings/str_cat.h"
+#include "absl/strings/str_format.h"
+
+namespace Envoy {
+namespace Json {
+
+JsonSanitizer::JsonSanitizer() {
+  // Single-char escape sequences for common control characters.
+  auto symbolic_escape = [this](char escape_char, char symbolic) {
+    Escape& escape = char_escapes_[static_cast<uint32_t>(escape_char)];
+    escape.size_ = 2;
+    escape.chars_[0] = '\\';
+    escape.chars_[1] = symbolic;
+  };
+  symbolic_escape('\b', 'b');
+  symbolic_escape('\f', 'f');
+  symbolic_escape('\n', 'n');
+  symbolic_escape('\r', 'r');
+  symbolic_escape('\t', 't');
+  symbolic_escape('\\', '\\');
+  symbolic_escape('"', '"');
+
+  // Low characters (0-31) not listed above are encoded as unicode 4-digit hex, plus
+  // a few other specific ones.
+  auto unicode_escape = [this](uint32_t index) {
+    Escape& escape = char_escapes_[static_cast<uint32_t>(index)];
+    if (escape.size_ == 0) {
+      std::string escape_str = absl::StrFormat("\\u%04x", index);
+      escape.size_ = escape_str.size();
+      RELEASE_ASSERT(escape.size_ <= sizeof(escape.chars_), "escaped string too large");
+      memcpy(escape.chars_, escape_str.data(), escape_str.size()); // NOLINT(safe-memcpy)
+    }
+  };
+  unicode_escape('<');
+  unicode_escape('>');
+  unicode_escape(127);
+
+  // Control-characters below 32.
+  for (uint32_t i = 0; i < ' '; ++i) {
+    unicode_escape(i);
+  }
+}
+
+absl::string_view JsonSanitizer::sanitize(std::string& buffer, absl::string_view str) const {
+  size_t past_escape = absl::string_view::npos;
+  for (uint32_t i = 0, n = str.size(); i < n; ++i) {
+    uint32_t index = static_cast<uint32_t>(static_cast<uint8_t>(str[i]));
+    const Escape& escape = char_escapes_[index];
+    if (escape.size_ != 0) {
+      absl::string_view escape_view(escape.chars_, escape.size_);
+      if (past_escape == absl::string_view::npos) {
+        // We only initialize buffer when we first learn we need to add an
+        // escape-sequence to the sanitized string.
+        if (i == 0) {
+          // The first character is an escape, and 'buffer' has not been cleared yet,
+          // so we need to assign it rather than append to it.
+          buffer.assign(escape_view.data(), escape_view.size());
+        } else {
+          // We found our first escape, but this is not the first character in the
+          // string, so we combine the unescaped characters in the string we already
+          // looped over with the new escaped character.
+          buffer = absl::StrCat(str.substr(0, i), escape_view);
+        }
+      } else if (i == past_escape) {
+        // We are adding an escape immediately after another escaped character.
+        absl::StrAppend(&buffer, escape_view);
+      } else {
+        // We are adding a new escape but must first cover the characters
+        // encountered since the previous escape.
+        absl::StrAppend(&buffer, str.substr(past_escape, i - past_escape), escape_view);
+      }
+      past_escape = i + 1;
+    }
+  }
-  size_t past_escape = absl::string_view::npos;
-  for (uint32_t i = 0, n = str.size(); i < n; ++i) {
-    uint32_t index = static_cast<uint32_t>(static_cast<uint8_t>(str[i]));
-    const std::string* escape = char_escapes_[index].get();
-    if (escape != nullptr) {
-      if (past_escape == absl::string_view::npos) {
-        // We only initialize buffer when we first learn we need to add an
-        // escape-sequence to the sanitized string.
-        if (i == 0) {
-          buffer = *escape;
-        } else {
-          buffer = absl::StrCat(str.substr(0, i), *escape);
-        }
-      } else if (i == past_escape) {
-        absl::StrAppend(&buffer, *escape);
-      } else {
-        absl::StrAppend(&buffer, str.substr(past_escape, i - past_escape), *escape);
-      }
-      past_escape = i + 1;
-    }
-  }
+  size_t past_escape = 0;
+  for (uint32_t i = 0, n = str.size(); i < n; ++i) {
+    const std::string* escape = char_escapes_[static_cast<uint8_t>(str[i])].get();
+    if (escape != nullptr) {
+      absl::StrAppend(&buffer, str.substr(past_escape, i - past_escape), *escape);
+      past_escape = i + 1;
+    }
+  }
-  size_t past_escape = absl::string_view::npos;
-  for (uint32_t i = 0, n = str.size(); i < n; ++i) {
-    uint32_t index = static_cast<uint32_t>(static_cast<uint8_t>(str[i]));
-    const std::string* escape = char_escapes_[index].get();
-    if (escape != nullptr) {
-      if (past_escape == absl::string_view::npos) {
-        // We only initialize buffer when we first learn we need to add an
-        // escape-sequence to the sanitized string.
-        if (i == 0) {
-          buffer = *escape;
-        } else {
-          buffer = absl::StrCat(str.substr(0, i), *escape);
-        }
-      } else if (i == past_escape) {
-        absl::StrAppend(&buffer, *escape);
-      } else {
-        absl::StrAppend(&buffer, str.substr(past_escape, i - past_escape), *escape);
-      }
-      past_escape = i + 1;
-    }
-  }
+  size_t past_escape = 0;
+  for (uint32_t i = 0, n = str.size(); i < n; ++i) {
+    const std::string* escape = char_escapes_[static_cast<uint8_t>(str[i])].get();
+    if (escape != nullptr) {
+      absl::StrAppend(&buffer, str.substr(past_escape, i - past_escape), *escape);
+      past_escape = i + 1;
+    }
+  }
+
+  // If no escape-sequence was needed, we just return the input.
+  if (past_escape == absl::string_view::npos) {
+    return str;
+  }
+
+  // Otherwise we append on any unescaped chunk at the end of the input, and
+  // return buffer as the result.
+  if (past_escape < str.size()) {
+    absl::StrAppend(&buffer, str.substr(past_escape, str.size() - past_escape));
+  }
+  return buffer;
+}
+
+} // namespace Json
+} // namespace Envoy
diff --git a/source/common/json/json_sanitizer.h b/source/common/json/json_sanitizer.h
@@ -0,0 +1,56 @@
+#pragma once
+
+#include <string>
+
+#include "absl/strings/string_view.h"
+
+namespace Envoy {
+namespace Json {
+
+// Hand-rolled JSON sanitizer that has exactly the same behavior as serializing
+// through protobufs, but is more than 10x faster. From
+// test/common/json/json_sanitizer_speed_test.cc:
+//
+// ---------------------------------------------------------------------
+// Benchmark                           Time             CPU   Iterations
+// ---------------------------------------------------------------------
+// BM_ProtoEncoderNoEscape          1102 ns         1102 ns       542926
+// BM_JsonSanitizerNoEscape         12.0 ns         12.0 ns     57347682
+// BM_ProtoEncoderWithEscape        1377 ns         1377 ns       507363
+// BM_JsonSanitizerWithEscape       77.3 ns         77.3 ns      8729626
+//
+class JsonSanitizer {
+public:
+  // Constructing the sanitizer fills in a table with all escape-sequences,
+  // indexed by character. To make this perform well, you should instantiate the
+  // sanitizer in a context that lives across a large number of sanitizations.
+  JsonSanitizer();
+
+  /**
+   * Sanitizes a string so it is suitable for JSON. The buffer is
+   * used if any of the characters in str need to be escaped.
+   *
+   * @param buffer a string in which an escaped string can be written, if needed. It
+   *   is not necessary for callers to clear the buffer first; it be cleared
+   *   by this method if the input needs to be escaped.
+   * @param str the string to be translated
+   * @return the translated string_view.
+   */
+  absl::string_view sanitize(std::string& buffer, absl::string_view str) const;
+
+private:
+  // Character-indexed array of translation strings. If an entry is nullptr then
+  // the character does not require substitution. This strategy is dependent on
+  // the property of UTF-8 where all two-byte characters have the high-order bit
+  // set for both bytes, and don't require escaping for JSON. Thus we can
+  // consider each character in isolation for escaping. Reference:
+  // https://en.wikipedia.org/wiki/UTF-8.
+  struct Escape {
+    uint8_t size_{0};
+    char chars_[7]; // No need to initialize char data, as we are not null-terminating.
+  };
+  Escape char_escapes_[256];
+};
+
+} // namespace Json
+} // namespace Envoy
diff --git a/test/common/json/BUILD b/test/common/json/BUILD
@@ -1,5 +1,6 @@
 load(
     "//bazel:envoy_build_system.bzl",
+    "envoy_cc_benchmark_binary",
     "envoy_cc_fuzz_test",
     "envoy_cc_test",
     "envoy_package",
@@ -36,3 +37,21 @@ envoy_cc_test(
         "//test/test_common:utility_lib",
     ],
 )
+
+envoy_cc_test(
+    name = "json_sanitizer_test",
+    srcs = ["json_sanitizer_test.cc"],
+    deps = [
+        "//source/common/json:json_sanitizer_lib",
+        "//source/common/protobuf:utility_lib",
+    ],
+)
+
+envoy_cc_benchmark_binary(
+    name = "json_sanitizer_speed_test",
+    srcs = ["json_sanitizer_speed_test.cc"],
+    deps = [
+        "//source/common/json:json_sanitizer_lib",
+        "//source/common/protobuf:utility_lib",
+    ],
+)
diff --git a/test/common/json/json_sanitizer_speed_test.cc b/test/common/json/json_sanitizer_speed_test.cc
@@ -0,0 +1,53 @@
+#include "source/common/json/json_sanitizer.h"
+#include "source/common/protobuf/utility.h"
+
+#include "benchmark/benchmark.h"
+
+// NOLINT(namespace-envoy)
+
+constexpr absl::string_view pass_through_encoding = "Now is the time for all good men";
+constexpr absl::string_view escaped_encoding = "Now <is the \"time\"> for all good men";
+
+// NOLINTNEXTLINE(readability-identifier-naming)
+static void BM_ProtoEncoderNoEscape(benchmark::State& state) {
+  const std::string str = std::string(pass_through_encoding);
+
+  for (auto _ : state) { // NOLINT
+    Envoy::MessageUtil::getJsonStringFromMessageOrDie(Envoy::ValueUtil::stringValue(str), false,
+                                                      true);
+  }
+}
+BENCHMARK(BM_ProtoEncoderNoEscape);
+
+// NOLINTNEXTLINE(readability-identifier-naming)
+static void BM_JsonSanitizerNoEscape(benchmark::State& state) {
+  Envoy::Json::JsonSanitizer sanitizer;
+  std::string buffer;
+
+  for (auto _ : state) { // NOLINT
+    sanitizer.sanitize(buffer, pass_through_encoding);
+  }
+}
+BENCHMARK(BM_JsonSanitizerNoEscape);
+
+// NOLINTNEXTLINE(readability-identifier-naming)
+static void BM_ProtoEncoderWithEscape(benchmark::State& state) {
+  const std::string str = std::string(escaped_encoding);
+
+  for (auto _ : state) { // NOLINT
+    Envoy::MessageUtil::getJsonStringFromMessageOrDie(Envoy::ValueUtil::stringValue(str), false,
+                                                      true);
+  }
+}
+BENCHMARK(BM_ProtoEncoderWithEscape);
+
+// NOLINTNEXTLINE(readability-identifier-naming)
+static void BM_JsonSanitizerWithEscape(benchmark::State& state) {
+  Envoy::Json::JsonSanitizer sanitizer;
+  std::string buffer;
+
+  for (auto _ : state) { // NOLINT
+    sanitizer.sanitize(buffer, escaped_encoding);
+  }
+}
+BENCHMARK(BM_JsonSanitizerWithEscape);
diff --git a/test/common/json/json_sanitizer_test.cc b/test/common/json/json_sanitizer_test.cc
@@ -0,0 +1,96 @@
+#include "source/common/json/json_sanitizer.h"
+#include "source/common/protobuf/utility.h"
+
+#include "gtest/gtest.h"
+
+namespace Envoy {
+namespace Json {
+namespace {
+
+class JsonSanitizerTest : public testing::Test {
+protected:
+  absl::string_view sanitizeAndCheckAgainstProtobufJson(absl::string_view str) {
+    absl::string_view hand_sanitized = sanitizer_.sanitize(buffer_, str);
+    std::string proto_sanitized = MessageUtil::getJsonStringFromMessageOrDie(
+        ValueUtil::stringValue(std::string(str)), false, true);
+    EXPECT_EQ(proto_sanitized, absl::StrCat("\"", hand_sanitized, "\"")) << "str=" << str;
+    return hand_sanitized;
+  }
+
+  void expectUnchanged(absl::string_view str) {
+    EXPECT_EQ(str, sanitizeAndCheckAgainstProtobufJson(str));
+  }
+
+  JsonSanitizer sanitizer_;
+  std::string buffer_;
+};
+
+TEST_F(JsonSanitizerTest, Empty) { expectUnchanged(""); }
+
+TEST_F(JsonSanitizerTest, NoEscape) {
+  expectUnchanged("abcdefghijklmnopqrstuvwxyz");
+  expectUnchanged("ABCDEFGHIJKLMNOPQRSTUVWXYZ");
+  expectUnchanged("1234567890");
+  expectUnchanged(" `~!@#$%^&*()_+-={}|[]");
+}
+
+TEST_F(JsonSanitizerTest, SlashChars) {
+  EXPECT_EQ("\\b", sanitizeAndCheckAgainstProtobufJson("\b"));
+  EXPECT_EQ("\\f", sanitizeAndCheckAgainstProtobufJson("\f"));
+  EXPECT_EQ("\\n", sanitizeAndCheckAgainstProtobufJson("\n"));
+  EXPECT_EQ("\\r", sanitizeAndCheckAgainstProtobufJson("\r"));
+  EXPECT_EQ("\\t", sanitizeAndCheckAgainstProtobufJson("\t"));
+  EXPECT_EQ("\\\\", sanitizeAndCheckAgainstProtobufJson("\\"));
+  EXPECT_EQ("\\\"", sanitizeAndCheckAgainstProtobufJson("\""));
+}
+
+TEST_F(JsonSanitizerTest, ControlChars) {
+  EXPECT_EQ("\\u0001", sanitizeAndCheckAgainstProtobufJson("\001"));
+  EXPECT_EQ("\\u0002", sanitizeAndCheckAgainstProtobufJson("\002"));
+  EXPECT_EQ("\\b", sanitizeAndCheckAgainstProtobufJson("\010"));
+  EXPECT_EQ("\\t", sanitizeAndCheckAgainstProtobufJson("\011"));
+  EXPECT_EQ("\\n", sanitizeAndCheckAgainstProtobufJson("\012"));
+  EXPECT_EQ("\\u000b", sanitizeAndCheckAgainstProtobufJson("\013"));
+  EXPECT_EQ("\\f", sanitizeAndCheckAgainstProtobufJson("\014"));
+  EXPECT_EQ("\\r", sanitizeAndCheckAgainstProtobufJson("\015"));
+  EXPECT_EQ("\\u000e", sanitizeAndCheckAgainstProtobufJson("\016"));
+  EXPECT_EQ("\\u000f", sanitizeAndCheckAgainstProtobufJson("\017"));
+  EXPECT_EQ("\\u0010", sanitizeAndCheckAgainstProtobufJson("\020"));
+  EXPECT_EQ("\\u003c", sanitizeAndCheckAgainstProtobufJson("<"));
+  EXPECT_EQ("\\u003e", sanitizeAndCheckAgainstProtobufJson(">"));
+}
+
+TEST_F(JsonSanitizerTest, SevenBitAscii) {
+  // Cover all the 7-bit ascii values, calling sanitize so that it checks
+  // our hand-rolled sanitizer vs protobuf. We ignore the return-value of
+  // sanitize(); we are just calling for it to test against protobuf.
+  for (uint32_t i = 0; i < 128; ++i) {
+    char c = i;
+    sanitizeAndCheckAgainstProtobufJson(absl::string_view(&c, 1));
+  }
+}
+
+TEST_F(JsonSanitizerTest, Utf8) {
+  // reference; https://www.charset.org/utf-8
+  auto unicode = [](std::vector<uint8_t> chars) -> std::string {
+    return std::string(reinterpret_cast<const char*>(&chars[0]), chars.size());
+  };
+
+  expectUnchanged(unicode({0xc2, 0xa2})); // Cent.
+  expectUnchanged(unicode({0xc2, 0xa9})); // Copyright.
+  expectUnchanged(unicode({0xc3, 0xa0})); // 'a' with accent grave.
+}
+
+TEST_F(JsonSanitizerTest, Interspersed) {
+  EXPECT_EQ("a\\bc", sanitizeAndCheckAgainstProtobufJson("a\bc"));
+  EXPECT_EQ("a\\b\\fc", sanitizeAndCheckAgainstProtobufJson("a\b\fc"));
+  EXPECT_EQ("\\bac", sanitizeAndCheckAgainstProtobufJson("\bac"));
+  EXPECT_EQ("\\b\\fac", sanitizeAndCheckAgainstProtobufJson("\b\fac"));
+  EXPECT_EQ("ac\\b", sanitizeAndCheckAgainstProtobufJson("ac\b"));
+  EXPECT_EQ("ac\\b", sanitizeAndCheckAgainstProtobufJson("ac\b"));
+  EXPECT_EQ("\\ra\\f", sanitizeAndCheckAgainstProtobufJson("\ra\f"));
+}
+
+} // namespace
+} // namespace Json
+} // namespace Envoy
diff --git a/tools/spelling/spelling_dictionary.txt b/tools/spelling/spelling_dictionary.txt
@@ -1099,6 +1099,7 @@ rver
 rxhash
 sandboxed
 sanitization
+sanitizations
 sanitizer
 satisfiable
 scalability