Merged

25 commits
07b5ded
Enable delayed close timer after write flush. (Fixes #6392).
AndresGuedez Mar 29, 2019
82dc62d
Fix comment formatting.
AndresGuedez Mar 29, 2019
3f08a6d
Update documentation.
AndresGuedez Mar 29, 2019
eb58ad6
Test cleanup.
AndresGuedez Mar 29, 2019
b06bb0b
Enable delayed close timer on close().
AndresGuedez Apr 4, 2019
dd6d9ea
Clarify documentation and minor readability refactor.
AndresGuedez Apr 5, 2019
412f485
Merge remote-tracking branch 'upstream/master' into delayed-close-flu…
AndresGuedez Apr 8, 2019
f60db05
Cleanup.
AndresGuedez Apr 9, 2019
11008bd
Simplify logic to handle multiple close()s.
AndresGuedez Apr 9, 2019
aebc8cf
Fix spelling mistake in comment.
AndresGuedez Apr 9, 2019
0424594
Strengthen ASSERT()s and add comments for clarity.
AndresGuedez Apr 9, 2019
efad9c6
Minor cleanup.
AndresGuedez Apr 9, 2019
788e07a
Merge remote-tracking branch 'upstream/master' into delayed-close-flu…
AndresGuedez Apr 9, 2019
2d75b29
Cleanup.
AndresGuedez Apr 9, 2019
031f520
Cleanup.
AndresGuedez Apr 9, 2019
06ec3e9
Clarify comments.
AndresGuedez Apr 10, 2019
e9c0a73
Further comment clarification.
AndresGuedez Apr 10, 2019
adb0488
Clarify the accepted use of the 'type' argument for close().
AndresGuedez Apr 10, 2019
012bd0a
Add documentation about setting useful delayed close timeout values.
AndresGuedez Apr 10, 2019
a68c785
Revert "Simplify logic to handle multiple close()s."
AndresGuedez Apr 10, 2019
77ec586
s/CloseAfterFlushAndTimeout/CloseAfterFlushAndWait
AndresGuedez Apr 11, 2019
0e2248c
Merge remote-tracking branch 'upstream/master' into delayed-close-flu…
AndresGuedez Apr 11, 2019
2d3d44b
Update release notes.
AndresGuedez Apr 12, 2019
577729e
Merge remote-tracking branch 'upstream/master' into delayed-close-flu…
AndresGuedez Apr 12, 2019
7a6bceb
Fix version history after merge.
AndresGuedez Apr 12, 2019
@@ -200,8 +200,14 @@ message HttpConnectionManager {

// The delayed close timeout is for downstream connections managed by the HTTP connection manager.
// It is defined as a grace period after connection close processing has been locally initiated
// during which Envoy will flush the write buffers for the connection and await the peer to close
// (i.e., a TCP FIN/RST is received by Envoy from the downstream connection).
// during which Envoy will wait for the peer to close (i.e., a TCP FIN/RST is received by Envoy
// from the downstream connection) prior to Envoy closing the socket associated with that
// connection.
// NOTE: This timeout is enforced even when the socket associated with the downstream connection
// is pending a flush of the write buffer. However, any progress made writing data to the socket
// will restart the timer associated with this timeout. This means that the total grace period for
// a socket in this state will be
// <total_time_waiting_for_write_buffer_flushes>+<delayed_close_timeout>.
//
// Delaying Envoy's connection close and giving the peer the opportunity to initiate the close
// sequence mitigates a race condition that exists when downstream clients do not drain/process
@@ -213,8 +219,15 @@ message HttpConnectionManager {
//
// The default timeout is 1000 ms if this option is not specified.
//
// A value of 0 will completely disable delayed close processing, and the downstream connection's
// socket will be closed immediately after the write flush is completed.
// .. NOTE::
// To be useful in avoiding the race condition described above, this timeout must be set
// to *at least* <max round trip time expected between clients and Envoy>+<100ms to account for
// a reasonable "worst" case processing time for a full iteration of Envoy's event loop>.
//
// .. WARNING::
// A value of 0 will completely disable delayed close processing. When disabled, the downstream
// connection's socket will be closed immediately after the write flush is completed or will
// never close if the write flush does not complete.
google.protobuf.Duration delayed_close_timeout = 26 [(gogoproto.stdduration) = true];

// Configuration for :ref:`HTTP access logs <arch_overview_access_logs>`
1 change: 1 addition & 0 deletions docs/root/intro/version_history.rst
@@ -4,6 +4,7 @@ Version history
1.11.0 (Pending)
================
* dubbo_proxy: support the :ref:`Dubbo proxy filter <config_network_filters_dubbo_proxy>`.
* http: mitigated a race condition with the :ref:`delayed_close_timeout<envoy_api_field_config.filter.network.http_connection_manager.v2.HttpConnectionManager.delayed_close_timeout>` where it could trigger while actively flushing a pending write buffer for a downstream connection.
* upstream: added :ref:`upstream_cx_pool_overflow <config_cluster_manager_cluster_stats>` for the connection pool circuit breaker.

1.10.0 (Apr 5, 2019)
1 change: 1 addition & 0 deletions include/envoy/network/connection.h
@@ -254,6 +254,7 @@ class Connection : public Event::DeferredDeletable, public FilterManager {

/**
* Set the timeout for delayed connection close()s.
* This can only be called prior to issuing a close() on the connection.
* @param timeout The timeout value in milliseconds
*/
virtual void setDelayedCloseTimeout(std::chrono::milliseconds timeout) PURE;
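The ordering constraint documented above (set the timeout only before close()) can be modeled with a toy class; this is illustrative only, not the Envoy API, and the class name is made up.

```cpp
#include <cassert>
#include <chrono>

// Toy model of the contract: setDelayedCloseTimeout() is only valid before
// close() has been issued on the connection.
class ToyConnection {
public:
  void setDelayedCloseTimeout(std::chrono::milliseconds timeout) {
    assert(!closed_ && "timeout must be set before close()");
    timeout_ = timeout;
  }
  void close() { closed_ = true; }
  std::chrono::milliseconds timeout() const { return timeout_; }

private:
  bool closed_{false};
  std::chrono::milliseconds timeout_{1000};  // mirrors the 1000 ms default
};
```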
102 changes: 68 additions & 34 deletions source/common/network/connection_impl.cc
@@ -99,6 +99,7 @@ void ConnectionImpl::close(ConnectionCloseType type) {

uint64_t data_to_write = write_buffer_->length();
ENVOY_CONN_LOG(debug, "closing data_to_write={} type={}", *this, data_to_write, enumToInt(type));
const bool delayed_close_timeout_set = delayedCloseTimeout().count() > 0;
if (data_to_write == 0 || type == ConnectionCloseType::NoFlush ||
!transport_socket_->canFlushClose()) {
if (data_to_write > 0) {
@@ -107,13 +108,25 @@ void ConnectionImpl::close(ConnectionCloseType type) {
transport_socket_->doWrite(*write_buffer_, true);
}

closeSocket(ConnectionEvent::LocalClose);
if (type == ConnectionCloseType::FlushWriteAndDelay && delayed_close_timeout_set) {
// The socket is being closed and either there is no more data to write or the data can not be
// flushed (!transport_socket_->canFlushClose()). Since a delayed close has been requested,
// start the delayed close timer if it hasn't been done already by a previous close().
// NOTE: Even though the delayed_close_state_ is being set to CloseAfterFlushAndWait, since
// a write event is not being registered for the socket, this logic is simply setting the
// timer and waiting for it to trigger to close the socket.
if (!inDelayedClose()) {
initializeDelayedCloseTimer();
[Review — Contributor]
I don't think we gracefully handle a caller doing

  setDelayedCloseTimeout(timeout1)
  close(ConnectionCloseType::FlushWriteAndDelay)
  setDelayedCloseTimeout(timeout2)
  close(ConnectionCloseType::FlushWriteAndDelay)

and I don't think we need to. However, do you think it's worth commenting on that somewhere in the APIs, and/or an assert in setDelayedCloseTimeout that you can't set-after-close, or am I overthinking? Arguably doing anything after close is pretty sketchy, but we are handling multiple close() calls for good reasons...

[Reply — Author]
Added ASSERT() and comment.

delayed_close_state_ = DelayedCloseState::CloseAfterFlushAndWait;
}
} else {
closeSocket(ConnectionEvent::LocalClose);
}
} else {
ASSERT(type == ConnectionCloseType::FlushWrite ||
type == ConnectionCloseType::FlushWriteAndDelay);

// No need to continue if a FlushWrite/FlushWriteAndDelay has already been issued and there is a
// pending delayed close.
// If there is a pending delayed close, simply update the delayed close state.
//
// An example of this condition manifests when a downstream connection is closed early by Envoy,
// such as when a route can't be matched:
Expand All @@ -123,35 +136,31 @@ void ConnectionImpl::close(ConnectionCloseType type) {
// ConnectionManagerImpl::checkForDeferredClose()
// 2) A second close is issued by a subsequent call to
// ConnectionManagerImpl::checkForDeferredClose() prior to returning from onData()
if (delayed_close_) {
if (inDelayedClose()) {
// Validate that a delayed close timer is already enabled unless it was disabled via
// configuration.
ASSERT(!delayed_close_timeout_set || delayed_close_timer_ != nullptr);
if (type == ConnectionCloseType::FlushWrite || !delayed_close_timeout_set) {
[Review — Member]
Out of curiosity, does this state change actually happen, or is this preemptive? Just wondering if we can simplify until this becomes an actual issue. Can we just assert the type is the same?

[Reply — Author]
This was just preemptive. I've changed the logic to a stronger ASSERT() and removed unnecessary tests.

[Reply — Author]
My latest commit restores this logic. The reason is that after discussing with @alyssawilk offline, I now prefer maintaining Connection::close() API backwards compatibility with the existing behavior, which allows ConnectionCloseType transitions between close() calls. This minimizes the (admittedly small) risk that an existing user of the API passing FlushWrite and FlushWriteAndDelay would break after this PR is merged. It also enforces consistent handling of type transitions by allowing all ConnectionCloseType transitions, as opposed to special-casing FlushWrite and FlushWriteAndDelay.

Another option, which would significantly simplify the close() logic, is to enforce that callers use the same type after the initial close() is issued on a Connection, but this would break backwards compatibility and would have a much higher risk of breaking existing users.

[Reply — Member]
This would be my preference. Are there any existing users that actually do this? (There might be, and it might be a reasonable thing to try a graceful close followed by a forced close; I just don't remember.)

[Reply — Author, Apr 11, 2019]
Based on a quick scan, I haven't found any filters that attempt a graceful close followed by a forced close (it tends to be one or the other; typically the latter is only used on error-handling code paths prior to any other close()s being issued). However, this seems like a reasonable thing to support, and more importantly, the existing logic in the ConnectionImpl destructor forces support for X -> NoFlush transitions, since a "just in case" close(NoFlush) is always attempted on destruction. This could be changed, but it seems completely reasonable to me given that leaking connections is much worse than not fully flushing the write buffer or avoiding the race that the delayed_close_timeout is mitigating.

So, the modified proposal would be to only allow state transitions ConnectionCloseType::* -> ConnectionCloseType::NoFlush after the first close() has been issued. This still achieves the goal of simplifying the logic while providing some defense against bugs.

[Reply — Author, Apr 11, 2019]
@alyssawilk @mattklein123 I would prefer to create a second PR for changing the Connection::close() API to enforce the state-transition constraints.

This PR should be safe to merge given the amount of review and test coverage added. What do you think about doing this in two steps, with a follow-up PR for the API change after this one is merged? I think we will end up with a cleaner history and more granular options for rolling back if the API change causes breakage.

[Reply — Member]
Sure, that's fine with me.

delayed_close_state_ = DelayedCloseState::CloseAfterFlush;
} else {
delayed_close_state_ = DelayedCloseState::CloseAfterFlushAndWait;
}
return;
}

delayed_close_ = true;
const bool delayed_close_timeout_set = delayedCloseTimeout().count() > 0;

// NOTE: the delayed close timeout (if set) affects both FlushWrite and FlushWriteAndDelay
// closes:
// 1. For FlushWrite, the timeout sets an upper bound on how long to wait for the flush to
// complete before the connection is locally closed.
// 2. For FlushWriteAndDelay, the timeout specifies an upper bound on how long to wait for the
// flush to complete and the peer to close the connection before it is locally closed.

// All close types that follow do not actually close() the socket immediately so that buffered
// data can be written. However, we do want to stop reading to apply TCP backpressure.
read_enabled_ = false;

// Force a closeSocket() after the write buffer is flushed if the close_type calls for it or if
// no delayed close timeout is set.
close_after_flush_ = !delayed_close_timeout_set || type == ConnectionCloseType::FlushWrite;

// Create and activate a timer which will immediately close the connection if triggered.
// A config value of 0 disables the timeout.
// NOTE: At this point, it's already been validated that the connection is not already in
// delayed close processing and therefore the timer has not yet been created.
if (delayed_close_timeout_set) {
[Review — Contributor]
Optional (style thing): having sanity-checked that we couldn't call initializeDelayedCloseTimer() with inDelayedClose() true, I wonder if it's worth putting this in an else {} block just to make it super clear which branch we are on.

[Reply — Author]
I added a comment. I would prefer not to unnecessarily indent unless you think it makes a large readability difference.

delayed_close_timer_ = dispatcher_.createTimer([this]() -> void { onDelayedCloseTimeout(); });
ENVOY_CONN_LOG(debug, "setting delayed close timer with timeout {} ms", *this,
delayedCloseTimeout().count());
delayed_close_timer_->enableTimer(delayedCloseTimeout());
initializeDelayedCloseTimer();
[Review — Contributor]
Can't we still get here with data_to_write > 0? I thought if we did close(DelayedCloseState::CloseAfterFlushAndTimeout) we didn't want to arm the timer until the flush was complete?

[Reply — Author]
The timer is always armed when a close(FlushWrite) or close(FlushWriteAndDelay) is issued. The only difference between the two close types is that the socket is immediately closed after the flush with the former, while the delayed_close_timeout_ is allowed to expire and trigger with the latter.

[Reply — Contributor]
Recapping offline discussions for posterity: I missed the "I am redesigning this" email, Andres missed putting the new design in the description, and we are now untangled :-P

I will say that with the new plan I'm not convinced this solves #6392, insofar as 20ms is pretty short. That said, I think it solves an underlying problem worth solving, and #6392 may simply need longer timeouts. It might be worth commenting somewhere in the APIs that, to be useful, this timeout needs to be on the order of (1 max RTT + libevent loop time) to avoid races. I'll look for places we can add more clarity.

delayed_close_state_ = (type == ConnectionCloseType::FlushWrite)
? DelayedCloseState::CloseAfterFlush
: DelayedCloseState::CloseAfterFlushAndWait;
} else {
[Review — Contributor]
I was all excited that I'd found a bug where we would let connections idle out forever, but apparently that's working as intended?

Can I ask that the API doc go from

  // A value of 0 will completely disable delayed close processing, and the downstream connection's
  // socket will be closed immediately after the write flush is completed.

to

  // A value of 0 will completely disable delayed close processing, and the downstream connection's
  // socket will be closed immediately after the write flush is completed and will never close if
  // the write flush does not complete.

maybe with some .. attention:: flags, or "DANGER DANGER DANGER: if you'd like to leak connections, please set this to 0" :-P

[Reply — Author]
Done. Added a .. WARNING::.

delayed_close_state_ = DelayedCloseState::CloseAfterFlush;
}

file_event_->setEnabled(Event::FileReadyType::Write |
@@ -162,7 +171,7 @@ void ConnectionImpl::close(ConnectionCloseType type) {
Connection::State ConnectionImpl::state() const {
if (!ioHandle().isOpen()) {
return State::Closed;
} else if (delayed_close_) {
} else if (inDelayedClose()) {
return State::Closing;
} else {
return State::Open;
@@ -534,21 +543,37 @@ void ConnectionImpl::onWriteReady() {
uint64_t new_buffer_size = write_buffer_->length();
updateWriteBufferStats(result.bytes_processed_, new_buffer_size);

// NOTE: If the delayed_close_timer_ is set, it must only trigger after a delayed_close_timeout_
// period of inactivity from the last write event. Therefore, the timer must be reset to its
// original timeout value unless the socket is going to be closed as a result of the doWrite().

if (result.action_ == PostIoAction::Close) {
// It is possible (though unlikely) for the connection to have already been closed during the
// write callback. This can happen if we manage to complete the SSL handshake in the write
// callback, raise a connected event, and close the connection.
closeSocket(ConnectionEvent::RemoteClose);
[Review — Contributor]
Do we need to disable the timer upon closeSocket() if the timer has been set, or check in onDelayedCloseTimeout() whether the io_handle is still open? Probably I missed something in the workflow, as the old code didn't do so either. But it's worth commenting on why we don't do that.

[Reply — Contributor]
I just saw that closeSocket() disables the timer. Sorry for my ignorance...
} else if ((close_after_flush_ && new_buffer_size == 0) || bothSidesHalfClosed()) {
} else if ((inDelayedClose() && new_buffer_size == 0) || bothSidesHalfClosed()) {
ENVOY_CONN_LOG(debug, "write flush complete", *this);
closeSocket(ConnectionEvent::LocalClose);
} else if (result.action_ == PostIoAction::KeepOpen && result.bytes_processed_ > 0) {
for (BytesSentCb& cb : bytes_sent_callbacks_) {
cb(result.bytes_processed_);

// If a callback closes the socket, stop iterating.
if (!ioHandle().isOpen()) {
return;
if (delayed_close_state_ == DelayedCloseState::CloseAfterFlushAndWait) {
ASSERT(delayed_close_timer_ != nullptr);
delayed_close_timer_->enableTimer(delayedCloseTimeout());
[Review — Contributor]
I think the timer can still time out during a flush write if there is a long quiescence. Does resetting the timer here take effect if the timer times out in this call stack? I'm not familiar with libevent; it is entirely possible that event_del() already handles such a case well.

[Reply — Author]
I checked the libevent source and confirmed that event_add() on an already-active timer will correctly handle the case of ignoring a pending callback if the timer has triggered, and subsequently re-enqueuing it with the specified timeout.

} else {
ASSERT(bothSidesHalfClosed() || delayed_close_state_ == DelayedCloseState::CloseAfterFlush);
closeSocket(ConnectionEvent::LocalClose);
}
} else {
ASSERT(result.action_ == PostIoAction::KeepOpen);
if (delayed_close_timer_ != nullptr) {
delayed_close_timer_->enableTimer(delayedCloseTimeout());
}
if (result.bytes_processed_ > 0) {
for (BytesSentCb& cb : bytes_sent_callbacks_) {
cb(result.bytes_processed_);

// If a callback closes the socket, stop iterating.
if (!ioHandle().isOpen()) {
return;
}
}
}
}
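The branching in the rewritten onWriteReady() above can be summarized as a small decision sketch. This is illustrative only (it ignores the bothSidesHalfClosed() and SSL-handshake edge cases, and the names are made up); wait_for_peer corresponds to DelayedCloseState::CloseAfterFlushAndWait.

```cpp
#include <cassert>
#include <string>

// Decision table for the write-ready handler described above.
std::string onWriteReadyOutcome(bool io_error, unsigned long new_buffer_size,
                                bool in_delayed_close, bool wait_for_peer) {
  if (io_error) {
    return "close-remote";  // PostIoAction::Close
  }
  if (in_delayed_close && new_buffer_size == 0) {
    // Flush complete: either reset the timer and await the peer's close
    // (FlushWriteAndDelay), or close the socket locally right away.
    return wait_for_peer ? "rearm-timer-and-wait" : "close-local";
  }
  // Still flushing (or not closing at all): keep the socket open; any write
  // progress restarts the delayed close timer if one is armed.
  return "keep-open-rearm-timer";
}
```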
@@ -587,13 +612,22 @@ bool ConnectionImpl::bothSidesHalfClosed() {
}

void ConnectionImpl::onDelayedCloseTimeout() {
delayed_close_timer_.reset();
ENVOY_CONN_LOG(debug, "triggered delayed close", *this);
if (connection_stats_ != nullptr && connection_stats_->delayed_close_timeouts_ != nullptr) {
connection_stats_->delayed_close_timeouts_->inc();
}
closeSocket(ConnectionEvent::LocalClose);
}

void ConnectionImpl::initializeDelayedCloseTimer() {
const auto timeout = delayedCloseTimeout().count();
ASSERT(delayed_close_timer_ == nullptr && timeout > 0);
delayed_close_timer_ = dispatcher_.createTimer([this]() -> void { onDelayedCloseTimeout(); });
ENVOY_CONN_LOG(debug, "setting delayed close timer with timeout {} ms", *this, timeout);
delayed_close_timer_->enableTimer(delayedCloseTimeout());
}

absl::string_view ConnectionImpl::transportFailureReason() const {
return transport_socket_->failureReason();
}
22 changes: 20 additions & 2 deletions source/common/network/connection_impl.h
@@ -122,6 +122,8 @@ class ConnectionImpl : public virtual Connection,
static uint64_t nextGlobalIdForTest() { return next_global_id_; }

void setDelayedCloseTimeout(std::chrono::milliseconds timeout) override {
// Validate that this is only called prior to issuing a close() or closeSocket().
ASSERT(delayed_close_timer_ == nullptr && ioHandle().isOpen());
delayed_close_timeout_ = timeout;
}
std::chrono::milliseconds delayedCloseTimeout() const override { return delayed_close_timeout_; }
@@ -167,16 +169,32 @@
// Callback issued when a delayed close timeout triggers.
void onDelayedCloseTimeout();

void initializeDelayedCloseTimer();
bool inDelayedClose() const { return delayed_close_state_ != DelayedCloseState::None; }

static std::atomic<uint64_t> next_global_id_;

// States associated with delayed closing of the connection (i.e., when the underlying socket is
// not immediately close()d as a result of a ConnectionImpl::close()).
enum class DelayedCloseState {
None,
// The socket will be closed immediately after the buffer is flushed _or_ if a period of
// inactivity after the last write event greater than or equal to delayed_close_timeout_ has
// elapsed.
CloseAfterFlush,
// The socket will be closed after a grace period of delayed_close_timeout_ has elapsed after
// the socket is flushed _or_ if a period of inactivity after the last write event greater than
// or equal to delayed_close_timeout_ has elapsed.
CloseAfterFlushAndWait
};
DelayedCloseState delayed_close_state_{DelayedCloseState::None};

Event::Dispatcher& dispatcher_;
const uint64_t id_;
Event::TimerPtr delayed_close_timer_;
std::list<ConnectionCallbacks*> callbacks_;
std::list<BytesSentCb> bytes_sent_callbacks_;
bool read_enabled_{true};
bool close_after_flush_{false};
bool delayed_close_{false};
bool above_high_watermark_{false};
bool detect_early_close_{true};
bool enable_half_close_{false};
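The DelayedCloseState values introduced in the header above can be exercised with a standalone sketch of the transition rule applied in ConnectionImpl::close(). This is a simplification under stated assumptions: the real logic also depends on the write-buffer contents and transport_socket_->canFlushClose(), and the NoFlush case never enters delayed close at all.

```cpp
#include <cassert>

enum class ConnectionCloseType { NoFlush, FlushWrite, FlushWriteAndDelay };
enum class DelayedCloseState { None, CloseAfterFlush, CloseAfterFlushAndWait };

// Which delayed-close state a close() call lands in. timeout_set mirrors
// delayedCloseTimeout().count() > 0.
DelayedCloseState stateFor(ConnectionCloseType type, bool timeout_set) {
  if (type == ConnectionCloseType::NoFlush) {
    return DelayedCloseState::None;  // socket is closed immediately
  }
  // FlushWrite closes as soon as the buffer drains; disabling the timeout
  // (a value of 0) forces the same behavior for FlushWriteAndDelay.
  if (type == ConnectionCloseType::FlushWrite || !timeout_set) {
    return DelayedCloseState::CloseAfterFlush;
  }
  return DelayedCloseState::CloseAfterFlushAndWait;
}
```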