envoyproxy · mattklein123 · Jun 28, 2020 · Jun 8, 2020 · Jun 8, 2020 · Jun 8, 2020
diff --git a/docs/root/configuration/observability/statistics.rst b/docs/root/configuration/observability/statistics.rst
@@ -29,6 +29,7 @@ Server related statistics are rooted at *server.* with following statistics:
   hot_restart_generation, Gauge, Current hot restart generation -- like hot_restart_epoch but computed automatically by incrementing from parent.
   initialization_time_ms, Histogram, Total time taken for Envoy initialization in milliseconds. This is the time from server start-up until the worker threads are ready to accept new connections
   debug_assertion_failures, Counter, Number of debug assertion failures detected in a release build if compiled with `--define log_debug_assert_in_release=enabled` or zero otherwise
+  envoy_bug_failures, Counter, Number of envoy bug failures detected in a release build. File or report the issue if this increments as this may be serious.
   static_unknown_fields, Counter, Number of messages in static configuration with unknown fields
   dynamic_unknown_fields, Counter, Number of messages in dynamic configuration with unknown fields
 
diff --git a/docs/root/version_history/current.rst b/docs/root/version_history/current.rst
@@ -119,6 +119,7 @@ New Features
   <envoy_v3_api_field_config.route.v3.RouteAction.internal_redirect_policy>` field.
 * runtime: add new gauge :ref:`deprecated_feature_seen_since_process_start <runtime_stats>` that gets reset across hot restarts.
 * server: add the option :option:`--drain-strategy` to enable different drain strategies for DrainManager::drainClose().
+* server: added :ref:`server.envoy_bug_failures <server_statistics>` statistic to count ENVOY_BUG failures.
 * stats: added the option to :ref:`report counters as deltas <envoy_v3_api_field_config.metrics.v3.MetricsServiceConfig.report_counters_as_deltas>` to the metrics service stats sink.
 * tracing: tracing configuration has been made fully dynamic and every HTTP connection manager
   can now have a separate :ref:`tracing provider <envoy_v3_api_field_extensions.filters.network.http_connection_manager.v3.HttpConnectionManager.Tracing.provider>`.

diff --git a/source/common/common/BUILD b/source/common/common/BUILD
@@ -18,6 +18,10 @@ envoy_cc_library(
     name = "assert_lib",
     srcs = ["assert.cc"],
     hdrs = ["assert.h"],
+    external_deps = [
+        "abseil_base",
+        "abseil_synchronization",
+    ],
     deps = [":minimal_logger_lib"],
 )
 

diff --git a/source/common/common/assert.cc b/source/common/common/assert.cc
@@ -1,5 +1,9 @@
 #include "common/common/assert.h"
 
+#include "absl/container/flat_hash_map.h"
+#include "absl/strings/str_join.h"
+#include "absl/synchronization/mutex.h"
+
 namespace Envoy {
 namespace Assert {
 
@@ -28,15 +32,84 @@ class ActionRegistrationImpl : public ActionRegistration {
   static std::function<void()> debug_assertion_failure_record_action_;
 };
 
+// This class implements the logic for triggering ENVOY_BUG logs and actions. Logging and actions
+// will be triggered with exponential back-off per file and line bug.
+class EnvoyBugRegistrationImpl : public ActionRegistration {
+public:
+  EnvoyBugRegistrationImpl(std::function<void()> action) {
+    ASSERT(envoy_bug_failure_record_action_ == nullptr,
+           "An ENVOY_BUG action was already set. Currently only a single action is supported.");
+    envoy_bug_failure_record_action_ = action;
+    counters_.clear();
+  }
+
+  ~EnvoyBugRegistrationImpl() override {
+    ASSERT(envoy_bug_failure_record_action_ != nullptr);
+    envoy_bug_failure_record_action_ = nullptr;
+  }
+
+  // This method is invoked when an ENVOY_BUG condition fails. It increments a per file and line
+  // counter for every ENVOY_BUG hit in a mutex guarded map.
+  // The implementation may also be a inline static counter per-file and line. There is no benchmark
+  // to show that the performance of this mutex is any worse than atomic counters. Acquiring and
+  // releasing a mutex is cheaper than a cache miss, but the mutex here is contended for every
+  // ENVOY_BUG failure rather than per individual bug. Logging ENVOY_BUGs is not a performance
+  // critical path, and mutex contention would indicate that there is a serious failure.
+  // Currently, this choice reduces code size and has the advantage that behavior is easier to
+  // understand and debug, and test behavior is predictable.
+  static bool shouldLogAndInvoke(absl::string_view bug_name) {
+    // Increment counter, inserting first if counter does not exist.
+    uint64_t counter_value = 0;
+    {
+      absl::MutexLock lock(&mutex_);
+      counter_value = ++counters_[bug_name];
+    }
+
+    // Check if counter is power of two by its bitwise representation.
+    return (counter_value & (counter_value - 1)) == 0;
+  }
+
+  static void invokeAction() {
+    if (envoy_bug_failure_record_action_ != nullptr) {
+      envoy_bug_failure_record_action_();
+    }
+  }
+
+private:
+  // This implementation currently only handles one action being set at a time. This is currently
+  // sufficient. If multiple actions are ever needed, the actions should be chained when
+  // additional actions are registered.
+  static std::function<void()> envoy_bug_failure_record_action_;
+
+  using EnvoyBugMap = absl::flat_hash_map<std::string, uint64_t>;
+  static absl::Mutex mutex_;
+  static EnvoyBugMap counters_ GUARDED_BY(mutex_);
+};
+
 std::function<void()> ActionRegistrationImpl::debug_assertion_failure_record_action_;
+std::function<void()> EnvoyBugRegistrationImpl::envoy_bug_failure_record_action_;
+EnvoyBugRegistrationImpl::EnvoyBugMap EnvoyBugRegistrationImpl::counters_;
+absl::Mutex EnvoyBugRegistrationImpl::mutex_;
 
 ActionRegistrationPtr setDebugAssertionFailureRecordAction(const std::function<void()>& action) {
   return std::make_unique<ActionRegistrationImpl>(action);
 }
 
-void invokeDebugAssertionFailureRecordAction_ForAssertMacroUseOnly() {
+ActionRegistrationPtr setEnvoyBugFailureRecordAction(const std::function<void()>& action) {
+  return std::make_unique<EnvoyBugRegistrationImpl>(action);
+}
+
+void invokeDebugAssertionFailureRecordActionForAssertMacroUseOnly() {
   ActionRegistrationImpl::invokeAction();
 }
 
+void invokeEnvoyBugFailureRecordActionForEnvoyBugMacroUseOnly() {
+  EnvoyBugRegistrationImpl::invokeAction();
+}
+
+bool shouldLogAndInvokeEnvoyBugForEnvoyBugMacroUseOnly(absl::string_view bug_name) {
+  return EnvoyBugRegistrationImpl::shouldLogAndInvoke(bug_name);
+}
+
 } // namespace Assert
 } // namespace Envoy
diff --git a/source/common/common/assert.h b/source/common/common/assert.h
@@ -32,13 +32,44 @@ using ActionRegistrationPtr = std::unique_ptr<ActionRegistration>;
  */
 ActionRegistrationPtr setDebugAssertionFailureRecordAction(const std::function<void()>& action);
 
+/**
+ * Sets an action to be invoked when an ENVOY_BUG failure is detected in a release build. This
+ * action will be invoked each time an ENVOY_BUG failure is detected.
+ *
+ * This function is not thread-safe; concurrent calls to set the action are not allowed.
+ *
+ * The action may be invoked concurrently if two ENVOY_BUGs in different threads fail at the
+ * same time, so the action must be thread-safe.
+ *
+ * This has no effect in debug builds (envoy bug failure aborts the process).
+ *
+ * @param action The action to take when an envoy bug fails.
+ * @return A registration object. The registration is removed when the object is destructed.
+ */
+ActionRegistrationPtr setEnvoyBugFailureRecordAction(const std::function<void()>& action);
+
 /**
  * Invokes the action set by setDebugAssertionFailureRecordAction, or does nothing if
  * no action has been set.
  *
  * This should only be called by ASSERT macros in this file.
  */
-void invokeDebugAssertionFailureRecordAction_ForAssertMacroUseOnly();
+void invokeDebugAssertionFailureRecordActionForAssertMacroUseOnly();
+
+/**
+ * Invokes the action set by setEnvoyBugFailureRecordAction, or does nothing if
+ * no action has been set.
+ *
+ * This should only be called by ENVOY_BUG macros in this file.
+ */
+void invokeEnvoyBugFailureRecordActionForEnvoyBugMacroUseOnly();
+
+/**
+ * Increments power of two counter for EnvoyBugRegistrationImpl.
+ *
+ * This should only be called by ENVOY_BUG macros in this file.
+ */
+bool shouldLogAndInvokeEnvoyBugForEnvoyBugMacroUseOnly(absl::string_view bug_name);
 
 // CONDITION_STR is needed to prevent macros in condition from being expected, which obfuscates
 // the logged failure, e.g., "EAGAIN" vs "11".
@@ -87,7 +118,7 @@ void invokeDebugAssertionFailureRecordAction_ForAssertMacroUseOnly();
 #if !defined(NDEBUG) // If this is a debug build.
 #define ASSERT_ACTION abort()
 #else // If this is not a debug build, but ENVOY_LOG_DEBUG_ASSERT_IN_RELEASE is defined.
-#define ASSERT_ACTION Envoy::Assert::invokeDebugAssertionFailureRecordAction_ForAssertMacroUseOnly()
+#define ASSERT_ACTION Envoy::Assert::invokeDebugAssertionFailureRecordActionForAssertMacroUseOnly()
 #endif // !defined(NDEBUG)
 
 #define _ASSERT_ORIGINAL(X) _ASSERT_IMPL(X, #X, ASSERT_ACTION, "")
@@ -111,7 +142,7 @@ void invokeDebugAssertionFailureRecordAction_ForAssertMacroUseOnly();
 // This non-implementation ensures that its argument is a valid expression that can be statically
 // casted to a bool, but the expression is never evaluated and will be compiled away.
 #define KNOWN_ISSUE_ASSERT _NULL_ASSERT_IMPL
-#endif // defined(ENVOY_DEBUG_KNOWN_ISSUES)
+#endif // defined(ENVOY_DISABLE_KNOWN_ISSUE_ASSERTS)
 
 // If ASSERT is called with one argument, the ASSERT_SELECTOR will return
 // _ASSERT_ORIGINAL and this will call _ASSERT_ORIGINAL(__VA_ARGS__).
@@ -134,6 +165,42 @@ void invokeDebugAssertionFailureRecordAction_ForAssertMacroUseOnly();
     abort();                                                                                       \
   } while (false)
 
+#if !defined(NDEBUG)
+#define ENVOY_BUG_ACTION abort()
+#else
+#define ENVOY_BUG_ACTION Envoy::Assert::invokeEnvoyBugFailureRecordActionForEnvoyBugMacroUseOnly()
+#endif
+
+// These macros are needed to stringify __LINE__ correctly.
+#define STRINGIFY(X) #X
+#define TOSTRING(X) STRINGIFY(X)
+
+// CONDITION_STR is needed to prevent macros in condition from being expected, which obfuscates
+// the logged failure, e.g., "EAGAIN" vs "11".
+// ENVOY_BUG logging and actions are invoked only on power-of-two instances per log line.
+#define _ENVOY_BUG_IMPL(CONDITION, CONDITION_STR, ACTION, DETAILS)                                 \
+  do {                                                                                             \
+    if (!(CONDITION) && Envoy::Assert::shouldLogAndInvokeEnvoyBugForEnvoyBugMacroUseOnly(          \
+                            __FILE__ ":" TOSTRING(__LINE__))) {                                    \
+      const std::string& details = (DETAILS);                                                      \
+      ENVOY_LOG_TO_LOGGER(Envoy::Logger::Registry::getLog(Envoy::Logger::Id::envoy_bug), error,    \
+                          "envoy bug failure: {}.{}{}", CONDITION_STR,                             \
+                          details.empty() ? "" : " Details: ", details);                           \
+      ACTION;                                                                                      \
+    }                                                                                              \
+  } while (false)
+
+#define _ENVOY_BUG_VERBOSE(X, Y) _ENVOY_BUG_IMPL(X, #X, ENVOY_BUG_ACTION, Y)
+
+/**
+ * Indicate a failure condition that should never be met in normal circumstances. In contrast
+ * with ASSERT, an ENVOY_BUG is compiled in release mode. If a failure condition is met in release
+ * mode, it is logged and a stat is incremented with exponential back-off per ENVOY_BUG. In debug
+ * mode, it will crash if the condition is not met. ENVOY_BUG must be called with two arguments for
+ * verbose logging.
+ */
+#define ENVOY_BUG(...) _ENVOY_BUG_VERBOSE(__VA_ARGS__)
+
 // NOT_IMPLEMENTED_GCOVR_EXCL_LINE is for overridden functions that are expressly not implemented.
 // The macro name includes "GCOVR_EXCL_LINE" to exclude the macro's usage from code coverage
 // reports.

diff --git a/source/common/common/logger.h b/source/common/common/logger.h
@@ -34,6 +34,7 @@ namespace Logger {
   FUNCTION(conn_handler)                                                                           \
   FUNCTION(decompression)                                                                          \
   FUNCTION(dubbo)                                                                                  \
+  FUNCTION(envoy_bug)                                                                              \
   FUNCTION(ext_authz)                                                                              \
   FUNCTION(rocketmq)                                                                               \
   FUNCTION(file)                                                                                   \

diff --git a/source/server/server.cc b/source/server/server.cc
@@ -339,6 +339,8 @@ void InstanceImpl::initialize(const Options& options,
 
   assert_action_registration_ = Assert::setDebugAssertionFailureRecordAction(
       [this]() { server_stats_->debug_assertion_failures_.inc(); });
+  envoy_bug_action_registration_ = Assert::setEnvoyBugFailureRecordAction(
+      [this]() { server_stats_->envoy_bug_failures_.inc(); });
 
   InstanceImpl::failHealthcheck(false);
 

diff --git a/source/server/server.h b/source/server/server.h
@@ -54,6 +54,7 @@ namespace Server {
  */
 #define ALL_SERVER_STATS(COUNTER, GAUGE, HISTOGRAM)                                                \
   COUNTER(debug_assertion_failures)                                                                \
+  COUNTER(envoy_bug_failures)                                                                      \
   COUNTER(dynamic_unknown_fields)                                                                  \
   COUNTER(static_unknown_fields)                                                                   \
   GAUGE(concurrency, NeverImport)                                                                  \
@@ -318,6 +319,7 @@ class InstanceImpl final : Logger::Loggable<Logger::Id::main>,
   Stats::StoreRoot& stats_store_;
   std::unique_ptr<ServerStats> server_stats_;
   Assert::ActionRegistrationPtr assert_action_registration_;
+  Assert::ActionRegistrationPtr envoy_bug_action_registration_;
   ThreadLocal::Instance& thread_local_;
   Api::ApiPtr api_;
   Event::DispatcherPtr dispatcher_;

diff --git a/test/common/common/assert_test.cc b/test/common/common/assert_test.cc
@@ -42,4 +42,32 @@ TEST(AssertDeathTest, VariousLogs) {
   EXPECT_EQ(expected_counted_failures, assert_fail_count);
 }
 
+TEST(EnvoyBugDeathTest, VariousLogs) {
+  int envoy_bug_fail_count = 0;
+  // ENVOY_BUG actions only occur on power of two counts.
+  auto envoy_bug_action_registration =
+      Assert::setEnvoyBugFailureRecordAction([&]() { envoy_bug_fail_count++; });
+
+#ifndef NDEBUG
+  EXPECT_DEATH({ ENVOY_BUG(false, ""); }, ".*envoy bug failure: false.*");
+  EXPECT_DEATH({ ENVOY_BUG(false, ""); }, ".*envoy bug failure: false.*");
+  EXPECT_DEATH({ ENVOY_BUG(false, "With some logs"); },
+               ".*envoy bug failure: false. Details: With some logs.*");
+  EXPECT_EQ(0, envoy_bug_fail_count);
+#else
+  // Same log lines trigger exponential back-off.
+  for (int i = 0; i < 4; i++) {
+    ENVOY_BUG(false, "");
+  }
+  // 3 counts because 1st, 2nd, and 4th instances are powers of 2.
+  EXPECT_EQ(3, envoy_bug_fail_count);
+
+  // Different log lines have separate counters for exponential back-off.
+  EXPECT_LOG_CONTAINS("error", "envoy bug failure: false", ENVOY_BUG(false, ""));
+  EXPECT_LOG_CONTAINS("error", "envoy bug failure: false. Details: With some logs",
+                      ENVOY_BUG(false, "With some logs"));
+  EXPECT_EQ(5, envoy_bug_fail_count);
+#endif
+}
+
 } // namespace Envoy
diff --git a/test/server/server_test.cc b/test/server/server_test.cc
@@ -474,7 +474,22 @@ TEST_P(ServerInstanceImplTest, Stats) {
   EXPECT_EQ(2L, TestUtility::findGauge(stats_store_, "server.concurrency")->value());
   EXPECT_EQ(3L, TestUtility::findGauge(stats_store_, "server.hot_restart_epoch")->value());
 
-// This stat only works in this configuration.
+// The ENVOY_BUG stat works in release mode.
+#if defined(NDEBUG)
+  // Test exponential back-off on a fixed line ENVOY_BUG.
+  for (int i = 0; i < 16; i++) {
+    ENVOY_BUG(false, "");
+  }
+  EXPECT_EQ(5L, TestUtility::findCounter(stats_store_, "server.envoy_bug_failures")->value());
+  // Another ENVOY_BUG increments the counter.
+  ENVOY_BUG(false, "Testing envoy bug assertion failure detection in release build.");
+  EXPECT_EQ(6L, TestUtility::findCounter(stats_store_, "server.envoy_bug_failures")->value());
+#else
+  // The ENVOY_BUG macro aborts in debug mode.
+  EXPECT_DEATH(ENVOY_BUG(false, ""), "");
+#endif
+
+// The ASSERT stat only works in this configuration.
 #if defined(NDEBUG) && defined(ENVOY_LOG_DEBUG_ASSERT_IN_RELEASE)
   ASSERT(false, "Testing debug assertion failure detection in release build.");
   EXPECT_EQ(1L, TestUtility::findCounter(stats_store_, "server.debug_assertion_failures")->value());