envoyproxy · mattklein123 · Apr 12, 2019 · Mar 29, 2019 · Mar 29, 2019 · Mar 29, 2019
diff --git a/api/envoy/config/filter/network/http_connection_manager/v2/http_connection_manager.proto b/api/envoy/config/filter/network/http_connection_manager/v2/http_connection_manager.proto
@@ -200,8 +200,14 @@ message HttpConnectionManager {
 
   // The delayed close timeout is for downstream connections managed by the HTTP connection manager.
   // It is defined as a grace period after connection close processing has been locally initiated
-  // during which Envoy will flush the write buffers for the connection and await the peer to close
-  // (i.e., a TCP FIN/RST is received by Envoy from the downstream connection).
+  // during which Envoy will wait for the peer to close (i.e., a TCP FIN/RST is received by Envoy
+  // from the downstream connection) prior to Envoy closing the socket associated with that
+  // connection.
+  // NOTE: This timeout is enforced even when the socket associated with the downstream connection
+  // is pending a flush of the write buffer. However, any progress made writing data to the socket
+  // will restart the timer associated with this timeout. This means that the total grace period for
+  // a socket in this state will be
+  // <delayed_close_timeout>+<total_time_waiting_for_write_buffer_flushes>.
   //
   // Delaying Envoy's connection close and giving the peer the opportunity to initiate the close
   // sequence mitigates a race condition that exists when downstream clients do not drain/process

diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst
@@ -47,6 +47,7 @@ Version history
 * http: added :ref:`max request headers size <envoy_api_field_config.filter.network.http_connection_manager.v2.HttpConnectionManager.max_request_headers_kb>`. The default behaviour is unchanged.
 * http: added modifyDecodingBuffer/modifyEncodingBuffer to allow modifying the buffered request/response data.
 * http: added encodeComplete/decodeComplete. These are invoked at the end of the stream, after all data has been encoded/decoded respectively. Default implementation is a no-op.
+* http: fixed a bug with the :ref:`delayed_close_timeout<envoy_api_field_config.filter.network.http_connection_manager.v2.HttpConnectionManager.delayed_close_timeout>` where it could trigger while actively flushing a pending write buffer for a downstream connection.
 * outlier_detection: added support for :ref:`outlier detection event protobuf-based logging <arch_overview_outlier_detection_logging>`.
 * mysql: added a MySQL proxy filter that is capable of parsing SQL queries over MySQL wire protocol. Refer to :ref:`MySQL proxy<config_network_filters_mysql_proxy>` for more details.
 * performance: new buffer implementation (disabled by default; to test it, add "--use-libevent-buffers 0" to the command-line arguments when starting Envoy).

diff --git a/source/common/network/connection_impl.cc b/source/common/network/connection_impl.cc
@@ -99,6 +99,7 @@ void ConnectionImpl::close(ConnectionCloseType type) {
 
   uint64_t data_to_write = write_buffer_->length();
   ENVOY_CONN_LOG(debug, "closing data_to_write={} type={}", *this, data_to_write, enumToInt(type));
+  const bool delayed_close_timeout_set = delayedCloseTimeout().count() > 0;
   if (data_to_write == 0 || type == ConnectionCloseType::NoFlush ||
       !transport_socket_->canFlushClose()) {
     if (data_to_write > 0) {
@@ -107,13 +108,22 @@ void ConnectionImpl::close(ConnectionCloseType type) {
       transport_socket_->doWrite(*write_buffer_, true);
     }
 
-    closeSocket(ConnectionEvent::LocalClose);
+    if (type == ConnectionCloseType::FlushWriteAndDelay && delayed_close_timeout_set) {
+      // The connection is being closed and no flush will be awaited: either the write buffer is
+      // empty or transport_socket_->canFlushClose() is false. Since a delayed close has been
+      // requested, start the delayed close timer unless a delayed close is already in progress.
+      if (!inDelayedClose()) {
+        initializeDelayedCloseTimer();
+        delayed_close_state_ = DelayedCloseState::CloseAfterFlushAndTimeout;
+      }
+    } else {
+      closeSocket(ConnectionEvent::LocalClose);
+    }
   } else {
     ASSERT(type == ConnectionCloseType::FlushWrite ||
            type == ConnectionCloseType::FlushWriteAndDelay);
 
-    // No need to continue if a FlushWrite/FlushWriteAndDelay has already been issued and there is a
-    // pending delayed close.
+    // If there is a pending delayed close, simply update the delayed close state.
     //
     // An example of this condition manifests when a downstream connection is closed early by Envoy,
     // such as when a route can't be matched:
@@ -123,35 +133,29 @@ void ConnectionImpl::close(ConnectionCloseType type) {
     //          ConnectionManagerImpl::checkForDeferredClose()
     //     2) A second close is issued by a subsequent call to
     //        ConnectionManagerImpl::checkForDeferredClose() prior to returning from onData()
-    if (delayed_close_) {
+    if (inDelayedClose()) {
+      // Validate that a delayed close timer has already been created unless the timeout was
+      // disabled via configuration.
+      ASSERT(!delayed_close_timeout_set || delayed_close_timer_ != nullptr);
+      if (type == ConnectionCloseType::FlushWrite || !delayed_close_timeout_set) {
+        delayed_close_state_ = DelayedCloseState::CloseAfterFlush;
+      } else {
+        delayed_close_state_ = DelayedCloseState::CloseAfterFlushAndTimeout;
+      }
       return;
     }
 
-    delayed_close_ = true;
-    const bool delayed_close_timeout_set = delayedCloseTimeout().count() > 0;
-
-    // NOTE: the delayed close timeout (if set) affects both FlushWrite and FlushWriteAndDelay
-    // closes:
-    //   1. For FlushWrite, the timeout sets an upper bound on how long to wait for the flush to
-    //   complete before the connection is locally closed.
-    //   2. For FlushWriteAndDelay, the timeout specifies an upper bound on how long to wait for the
-    //   flush to complete and the peer to close the connection before it is locally closed.
-
     // All close types that follow do not actually close() the socket immediately so that buffered
     // data can be written. However, we do want to stop reading to apply TCP backpressure.
     read_enabled_ = false;
 
-    // Force a closeSocket() after the write buffer is flushed if the close_type calls for it or if
-    // no delayed close timeout is set.
-    close_after_flush_ = !delayed_close_timeout_set || type == ConnectionCloseType::FlushWrite;
-
-    // Create and activate a timer which will immediately close the connection if triggered.
-    // A config value of 0 disables the timeout.
     if (delayed_close_timeout_set) {
-      delayed_close_timer_ = dispatcher_.createTimer([this]() -> void { onDelayedCloseTimeout(); });
-      ENVOY_CONN_LOG(debug, "setting delayed close timer with timeout {} ms", *this,
-                     delayedCloseTimeout().count());
-      delayed_close_timer_->enableTimer(delayedCloseTimeout());
+      initializeDelayedCloseTimer();
+      delayed_close_state_ = (type == ConnectionCloseType::FlushWrite)
+                                 ? DelayedCloseState::CloseAfterFlush
+                                 : DelayedCloseState::CloseAfterFlushAndTimeout;
+    } else {
+      delayed_close_state_ = DelayedCloseState::CloseAfterFlush;
     }
 
     file_event_->setEnabled(Event::FileReadyType::Write |
@@ -162,7 +166,7 @@ void ConnectionImpl::close(ConnectionCloseType type) {
 Connection::State ConnectionImpl::state() const {
   if (!ioHandle().isOpen()) {
     return State::Closed;
-  } else if (delayed_close_) {
+  } else if (inDelayedClose()) {
     return State::Closing;
   } else {
     return State::Open;
@@ -529,6 +533,12 @@ void ConnectionImpl::onWriteReady() {
     }
   }
 
+  // Disable the delayed close timer since data is still being flushed. The timer should only
+  // trigger after a delayedCloseTimeout() period of inactivity.
+  if (delayed_close_timer_ != nullptr) {
+    delayed_close_timer_->disableTimer();
+  }
+
   IoResult result = transport_socket_->doWrite(*write_buffer_, write_end_stream_);
   ASSERT(!result.end_stream_read_); // The interface guarantees that only read operations set this.
   uint64_t new_buffer_size = write_buffer_->length();
@@ -539,16 +549,28 @@ void ConnectionImpl::onWriteReady() {
     // write callback. This can happen if we manage to complete the SSL handshake in the write
     // callback, raise a connected event, and close the connection.
     closeSocket(ConnectionEvent::RemoteClose);
-  } else if ((close_after_flush_ && new_buffer_size == 0) || bothSidesHalfClosed()) {
+  } else if ((inDelayedClose() && new_buffer_size == 0) || bothSidesHalfClosed()) {
     ENVOY_CONN_LOG(debug, "write flush complete", *this);
-    closeSocket(ConnectionEvent::LocalClose);
-  } else if (result.action_ == PostIoAction::KeepOpen && result.bytes_processed_ > 0) {
-    for (BytesSentCb& cb : bytes_sent_callbacks_) {
-      cb(result.bytes_processed_);
-
-      // If a callback closes the socket, stop iterating.
-      if (!ioHandle().isOpen()) {
-        return;
+    if (delayed_close_state_ == DelayedCloseState::CloseAfterFlushAndTimeout) {
+      ASSERT(delayed_close_timer_ != nullptr);
+      delayed_close_timer_->enableTimer(delayedCloseTimeout());
+    } else {
+      ASSERT(bothSidesHalfClosed() || delayed_close_state_ == DelayedCloseState::CloseAfterFlush);
+      closeSocket(ConnectionEvent::LocalClose);
+    }
+  } else {
+    ASSERT(result.action_ == PostIoAction::KeepOpen);
+    if (delayed_close_timer_ != nullptr) {
+      delayed_close_timer_->enableTimer(delayedCloseTimeout());
+    }
+    if (result.bytes_processed_ > 0) {
+      for (BytesSentCb& cb : bytes_sent_callbacks_) {
+        cb(result.bytes_processed_);
+
+        // If a callback closes the socket, stop iterating.
+        if (!ioHandle().isOpen()) {
+          return;
+        }
       }
     }
   }
@@ -587,13 +609,22 @@ bool ConnectionImpl::bothSidesHalfClosed() {
 }
 
 void ConnectionImpl::onDelayedCloseTimeout() {
+  delayed_close_timer_.reset(nullptr); // This timer just fired; release it before closing.
   ENVOY_CONN_LOG(debug, "triggered delayed close", *this);
   if (connection_stats_ != nullptr && connection_stats_->delayed_close_timeouts_ != nullptr) {
     connection_stats_->delayed_close_timeouts_->inc();
   }
   closeSocket(ConnectionEvent::LocalClose);
 }
 
+void ConnectionImpl::initializeDelayedCloseTimer() { // Creates and arms the delayed close timer.
+  const auto timeout = delayedCloseTimeout().count(); // Reported in ms by the log below.
+  ASSERT(delayed_close_timer_ == nullptr && timeout > 0); // No timer yet; timeout configured.
+  delayed_close_timer_ = dispatcher_.createTimer([this]() -> void { onDelayedCloseTimeout(); });
+  ENVOY_CONN_LOG(debug, "setting delayed close timer with timeout {} ms", *this, timeout);
+  delayed_close_timer_->enableTimer(delayedCloseTimeout()); // Start counting immediately.
+}
+
 absl::string_view ConnectionImpl::transportFailureReason() const {
   return transport_socket_->failureReason();
 }

diff --git a/source/common/network/connection_impl.h b/source/common/network/connection_impl.h
@@ -167,16 +167,22 @@ class ConnectionImpl : public virtual Connection,
   // Callback issued when a delayed close timeout triggers.
   void onDelayedCloseTimeout();
 
+  void initializeDelayedCloseTimer();
+  bool inDelayedClose() const { return delayed_close_state_ != DelayedCloseState::None; }
+
   static std::atomic<uint64_t> next_global_id_;
 
+  // States associated with delayed closing of the connection (i.e., when the underlying socket is
+  // not immediately close()d as a result of a ConnectionImpl::close()).
+  enum class DelayedCloseState { None, CloseAfterFlush, CloseAfterFlushAndTimeout };
+  DelayedCloseState delayed_close_state_{DelayedCloseState::None};
+
   Event::Dispatcher& dispatcher_;
   const uint64_t id_;
   Event::TimerPtr delayed_close_timer_;
   std::list<ConnectionCallbacks*> callbacks_;
   std::list<BytesSentCb> bytes_sent_callbacks_;
   bool read_enabled_{true};
-  bool close_after_flush_{false};
-  bool delayed_close_{false};
   bool above_high_watermark_{false};
   bool detect_early_close_{true};
   bool enable_half_close_{false};