diff --git a/docs/root/version_history/current.rst b/docs/root/version_history/current.rst index bb04894db4783..3ea3bcdfa5c1b 100644 --- a/docs/root/version_history/current.rst +++ b/docs/root/version_history/current.rst @@ -13,6 +13,8 @@ Bug Fixes --------- *Changes expected to improve the state of the world and are unlikely to have negative effects* +* active http health checks: properly handles HTTP/2 GOAWAY frames from the upstream. Previously a GOAWAY frame due to a graceful listener drain could cause improper failed health checks due to streams being refused by the upstream on a connection that is going away. To revert to old GOAWAY handling behavior, set the runtime feature `envoy.reloadable_features.health_check.graceful_goaway_handling` to false. + Removed Config or Runtime ------------------------- *Normally occurs at the end of the* :ref:`deprecation period ` diff --git a/source/common/http/http2/codec_impl.cc b/source/common/http/http2/codec_impl.cc index 74a565b279164..f53aff523b691 100644 --- a/source/common/http/http2/codec_impl.cc +++ b/source/common/http/http2/codec_impl.cc @@ -716,6 +716,15 @@ void ConnectionImpl::shutdownNotice() { } } +Status ConnectionImpl::protocolErrorForTest() { + int rc = nghttp2_submit_goaway(session_, NGHTTP2_FLAG_NONE, + nghttp2_session_get_last_proc_stream_id(session_), + NGHTTP2_PROTOCOL_ERROR, nullptr, 0); + ASSERT(rc == 0); + + return sendPendingFrames(); +} + Status ConnectionImpl::onBeforeFrameReceived(const nghttp2_frame_hd* hd) { ENVOY_CONN_LOG(trace, "about to recv frame type={}, flags={}", connection_, static_cast(hd->type), static_cast(hd->flags)); diff --git a/source/common/http/http2/codec_impl.h b/source/common/http/http2/codec_impl.h index 023c32c77e136..3e51d57eba5ae 100644 --- a/source/common/http/http2/codec_impl.h +++ b/source/common/http/http2/codec_impl.h @@ -105,6 +105,7 @@ class ConnectionImpl : public virtual Connection, protected Logger::LoggableaddConnectionCallbacks(connection_callback_impl_); + client_->setCodecConnectionCallbacks(http_connection_callback_impl_); expect_reset_ = false; + reuse_connection_ = parent_.reuse_connection_; } Http::RequestEncoder* request_encoder = &client_->newStream(*this); request_encoder->getStream().addCallbacks(*this); + request_in_flight_ = true; const auto request_headers = Http::createHeaderMap( {{Http::Headers::get().Method, "GET"}, @@ -284,15 +287,51 @@ void HttpHealthCheckerImpl::HttpActiveHealthCheckSession::onInterval() { void HttpHealthCheckerImpl::HttpActiveHealthCheckSession::onResetStream(Http::StreamResetReason, absl::string_view) { + request_in_flight_ = false; + ENVOY_CONN_LOG(debug, "connection/stream error health_flags={}", *client_, + HostUtility::healthFlagsToString(*host_)); if (expect_reset_) { return; } - ENVOY_CONN_LOG(debug, "connection/stream error health_flags={}", *client_, - HostUtility::healthFlagsToString(*host_)); + if (client_ && !reuse_connection_) { + client_->close(); + } + handleFailure(envoy::data::core::v3::NETWORK); } +void HttpHealthCheckerImpl::HttpActiveHealthCheckSession::onGoAway( + Http::GoAwayErrorCode error_code) { + ENVOY_CONN_LOG(debug, "connection going away goaway_code={}, health_flags={}", *client_, + error_code, HostUtility::healthFlagsToString(*host_)); + + // Runtime guard around graceful handling of NO_ERROR GOAWAY handling. The old behavior is to + // ignore GOAWAY completely. + if (!parent_.runtime_.snapshot().runtimeFeatureEnabled( + "envoy.reloadable_features.health_check.graceful_goaway_handling")) { + return; + } + + if (request_in_flight_ && error_code == Http::GoAwayErrorCode::NoError) { + // The server is starting a graceful shutdown. Allow the in flight request + // to finish without treating this as a health check error, and then + // reconnect. + reuse_connection_ = false; + return; + } + + if (request_in_flight_) { + // Record this as a failed health check. + handleFailure(envoy::data::core::v3::NETWORK); + } + + if (client_) { + expect_reset_ = true; + client_->close(); + } +} + HttpHealthCheckerImpl::HttpActiveHealthCheckSession::HealthCheckResult HttpHealthCheckerImpl::HttpActiveHealthCheckSession::healthCheckResult() { uint64_t response_code = Http::Utility::getResponseStatus(*response_headers_); @@ -323,6 +362,8 @@ HttpHealthCheckerImpl::HttpActiveHealthCheckSession::healthCheckResult() { } void HttpHealthCheckerImpl::HttpActiveHealthCheckSession::onResponseComplete() { + request_in_flight_ = false; + switch (healthCheckResult()) { case HealthCheckResult::Succeeded: handleSuccess(false); @@ -350,7 +391,7 @@ bool HttpHealthCheckerImpl::HttpActiveHealthCheckSession::shouldClose() const { return false; } - if (!parent_.reuse_connection_) { + if (!reuse_connection_) { return true; } @@ -380,6 +421,7 @@ bool HttpHealthCheckerImpl::HttpActiveHealthCheckSession::shouldClose() const { } void HttpHealthCheckerImpl::HttpActiveHealthCheckSession::onTimeout() { + request_in_flight_ = false; if (client_) { host_->setActiveHealthFailureType(Host::ActiveHealthFailureType::TIMEOUT); ENVOY_CONN_LOG(debug, "connection/stream timeout health_flags={}", *client_, diff --git a/source/common/upstream/health_checker_impl.h b/source/common/upstream/health_checker_impl.h index 8066cc9e9b8fa..3230866c306e1 100644 --- a/source/common/upstream/health_checker_impl.h +++ b/source/common/upstream/health_checker_impl.h @@ -106,6 +106,7 @@ class HttpHealthCheckerImpl : public HealthCheckerImplBase { void onBelowWriteBufferLowWatermark() override {} void onEvent(Network::ConnectionEvent event); + void onGoAway(Http::GoAwayErrorCode error_code); class ConnectionCallbackImpl : public Network::ConnectionCallbacks { public: @@ -119,7 +120,18 @@ class HttpHealthCheckerImpl : public HealthCheckerImplBase { HttpActiveHealthCheckSession& parent_; }; + class HttpConnectionCallbackImpl : public Http::ConnectionCallbacks { + public: + HttpConnectionCallbackImpl(HttpActiveHealthCheckSession& parent) : parent_(parent) {} + // Http::ConnectionCallbacks + void onGoAway(Http::GoAwayErrorCode error_code) override { parent_.onGoAway(error_code); } + + private: + HttpActiveHealthCheckSession& parent_; + }; + ConnectionCallbackImpl connection_callback_impl_{*this}; + HttpConnectionCallbackImpl http_connection_callback_impl_{*this}; HttpHealthCheckerImpl& parent_; Http::CodecClientPtr client_; Http::ResponseHeaderMapPtr response_headers_; @@ -127,6 +139,8 @@ class HttpHealthCheckerImpl : public HealthCheckerImplBase { const Http::Protocol protocol_; Network::SocketAddressProviderSharedPtr local_address_provider_; bool expect_reset_{}; + bool reuse_connection_ = false; + bool request_in_flight_ = false; }; using HttpActiveHealthCheckSessionPtr = std::unique_ptr; diff --git a/test/common/http/http2/codec_impl_test.cc b/test/common/http/http2/codec_impl_test.cc index a8ab0eaf146da..3daec9a7d43f9 100644 --- a/test/common/http/http2/codec_impl_test.cc +++ b/test/common/http/http2/codec_impl_test.cc @@ -349,6 +349,24 @@ TEST_P(Http2CodecImplTest, ShutdownNotice) { response_encoder_->encodeHeaders(response_headers, true); } +TEST_P(Http2CodecImplTest, ProtocolErrorForTest) { + initialize(); + EXPECT_EQ(absl::nullopt, request_encoder_->http1StreamEncoderOptions()); + + TestRequestHeaderMapImpl request_headers; + HttpTestUtility::addDefaultHeaders(request_headers); + EXPECT_CALL(request_decoder_, decodeHeaders_(_, true)); + EXPECT_TRUE(request_encoder_->encodeHeaders(request_headers, true).ok()); + + EXPECT_CALL(client_callbacks_, onGoAway(Http::GoAwayErrorCode::Other)); + + // We have to dynamic cast because protocolErrorForTest() is intentionally not on the + // Connection API. + ServerConnectionImpl* raw_server = dynamic_cast(server_.get()); + ASSERT(raw_server != nullptr); + EXPECT_EQ(StatusCode::CodecProtocolError, getStatusCode(raw_server->protocolErrorForTest())); +} + // 100 response followed by 200 results in a [decode100ContinueHeaders, decodeHeaders] sequence. TEST_P(Http2CodecImplTest, ContinueHeaders) { initialize(); diff --git a/test/common/upstream/health_check_fuzz_test_utils.cc b/test/common/upstream/health_check_fuzz_test_utils.cc index 78b5feac68fd5..e67e5f7a6cf06 100644 --- a/test/common/upstream/health_check_fuzz_test_utils.cc +++ b/test/common/upstream/health_check_fuzz_test_utils.cc @@ -51,11 +51,12 @@ void HttpHealthCheckerImplTestBase::expectClientCreate( std::shared_ptr cluster{ new NiceMock()}; Event::MockDispatcher dispatcher_; - return new CodecClientForTest( + test_session.codec_client_ = new CodecClientForTest( Http::CodecClient::Type::HTTP1, std::move(conn_data.connection_), test_session.codec_, nullptr, Upstream::makeTestHost(cluster, "tcp://127.0.0.1:9000", dispatcher_.timeSource()), dispatcher_); + return test_session.codec_client_; })); } diff --git a/test/common/upstream/health_check_fuzz_test_utils.h b/test/common/upstream/health_check_fuzz_test_utils.h index 1074823314e74..a87eb51232f98 100644 --- a/test/common/upstream/health_check_fuzz_test_utils.h +++ b/test/common/upstream/health_check_fuzz_test_utils.h @@ -48,6 +48,7 @@ class HttpHealthCheckerImplTestBase : public HealthCheckerTestBase { Network::MockClientConnection* client_connection_{}; NiceMock request_encoder_; Http::ResponseDecoder* stream_response_callbacks_{}; + CodecClientForTest* codec_client_{}; }; using TestSessionPtr = std::unique_ptr; diff --git a/test/common/upstream/health_checker_impl_test.cc b/test/common/upstream/health_checker_impl_test.cc index 4c97ea1dd7dd6..32b673f63c378 100644 --- a/test/common/upstream/health_checker_impl_test.cc +++ b/test/common/upstream/health_checker_impl_test.cc @@ -137,6 +137,7 @@ class HttpHealthCheckerImplTest : public Event::TestUsingSimulatedTime, Network::MockClientConnection* client_connection_{}; NiceMock request_encoder_; Http::ResponseDecoder* stream_response_callbacks_{}; + CodecClientForTest* codec_client_{}; }; using TestSessionPtr = std::unique_ptr; @@ -176,6 +177,25 @@ class HttpHealthCheckerImplTest : public Event::TestUsingSimulatedTime, addCompletionCallback(); } + void setupHCHttp2() { + const std::string yaml = R"EOF( + timeout: 1s + interval: 1s + no_traffic_interval: 5s + interval_jitter: 1s + unhealthy_threshold: 1 + healthy_threshold: 1 + http_health_check: + service_name_matcher: + prefix: locations + path: /healthcheck + codec_client_type: Http2 + )EOF"; + + allocHealthChecker(yaml); + addCompletionCallback(); + } + void setupInitialJitter() { const std::string yaml = R"EOF( timeout: 1s @@ -561,10 +581,11 @@ class HttpHealthCheckerImplTest : public Event::TestUsingSimulatedTime, std::shared_ptr cluster{ new NiceMock()}; Event::MockDispatcher dispatcher_; - return new CodecClientForTest( + test_session.codec_client_ = new CodecClientForTest( Http::CodecClient::Type::HTTP1, std::move(conn_data.connection_), test_session.codec_, nullptr, Upstream::makeTestHost(cluster, "tcp://127.0.0.1:9000", simTime()), dispatcher_); + return test_session.codec_client_; })); } @@ -663,6 +684,35 @@ class HttpHealthCheckerImplTest : public Event::TestUsingSimulatedTime, MOCK_METHOD(void, onHostStatus, (HostSharedPtr host, HealthTransition changed_state)); + void expectUnhealthyTransition(size_t index, bool first_check) { + EXPECT_CALL(*this, onHostStatus(_, HealthTransition::Changed)); + EXPECT_CALL(*test_sessions_[index]->timeout_timer_, disableTimer()); + EXPECT_CALL(*test_sessions_[index]->interval_timer_, enableTimer(_, _)); + if (first_check) { + EXPECT_CALL(event_logger_, logUnhealthy(_, _, _, _)); + } + EXPECT_CALL(event_logger_, logEjectUnhealthy(_, _, _)); + } + + void expectHealthyTransition(size_t index) { + EXPECT_CALL(*this, onHostStatus(_, HealthTransition::Changed)); + EXPECT_CALL(*test_sessions_[index]->timeout_timer_, disableTimer()); + EXPECT_CALL(*test_sessions_[index]->interval_timer_, enableTimer(_, _)); + EXPECT_CALL(event_logger_, logAddHealthy(_, _, _)); + } + + void expectUnchanged(size_t index) { + EXPECT_CALL(*this, onHostStatus(_, HealthTransition::Unchanged)); + EXPECT_CALL(*test_sessions_[index]->timeout_timer_, disableTimer()); + EXPECT_CALL(*test_sessions_[index]->interval_timer_, enableTimer(_, _)); + } + + void expectChangePending(size_t index) { + EXPECT_CALL(*this, onHostStatus(_, HealthTransition::ChangePending)); + EXPECT_CALL(*test_sessions_[index]->timeout_timer_, disableTimer()); + EXPECT_CALL(*test_sessions_[index]->interval_timer_, enableTimer(_, _)); + } + std::vector test_sessions_; std::shared_ptr health_checker_; std::list connection_index_{}; @@ -2612,6 +2662,290 @@ TEST_F(HttpHealthCheckerImplTest, NoTransportSocketMatchCriteria) { health_checker_->start(); } +// Test receiving GOAWAY (error) is interpreted as connection close event. +TEST_F(HttpHealthCheckerImplTest, GoAwayErrorProbeInProgress) { + // FailureType::Network will be issued, it will render host unhealthy only if unhealthy_threshold + // is reached. + setupNoServiceValidationHCWithHttp2(); + EXPECT_CALL(runtime_.snapshot_, featureEnabled("health_check.verify_cluster", 100)) + .WillRepeatedly(Return(false)); + + cluster_->prioritySet().getMockHostSet(0)->hosts_ = { + makeTestHost(cluster_->info_, "tcp://127.0.0.1:80", simTime())}; + cluster_->info_->stats().upstream_cx_total_.inc(); + expectSessionCreate(); + expectStreamCreate(0); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_, _)); + health_checker_->start(); + + // We start off as healthy, and should continue to be healthy. + expectUnchanged(0); + respond(0, "200", false, false, true, false, {}, false); + EXPECT_EQ(Host::Health::Healthy, cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health()); + + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_, _)); + expectStreamCreate(0); + test_sessions_[0]->interval_timer_->invokeCallback(); + + // GOAWAY with non-NO_ERROR code will result in a healthcheck failure and the + // connection closing. Status is unchanged because unhealthy_threshold is 2. + expectChangePending(0); + EXPECT_CALL( + runtime_.snapshot_, + runtimeFeatureEnabled("envoy.reloadable_features.health_check.graceful_goaway_handling")) + .WillOnce(Return(true)); + test_sessions_[0]->codec_client_->raiseGoAway(Http::GoAwayErrorCode::Other); + EXPECT_EQ(Host::Health::Healthy, cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health()); + + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_, _)); + expectClientCreate(0); + expectStreamCreate(0); + + test_sessions_[0]->interval_timer_->invokeCallback(); + + // GOAWAY with non-NO_ERROR code will result in a healthcheck failure and the + // connection closing. This time it goes unhealthy. + expectUnhealthyTransition(0, false); + EXPECT_CALL( + runtime_.snapshot_, + runtimeFeatureEnabled("envoy.reloadable_features.health_check.graceful_goaway_handling")) + .WillOnce(Return(true)); + test_sessions_[0]->codec_client_->raiseGoAway(Http::GoAwayErrorCode::Other); + EXPECT_EQ(Host::Health::Unhealthy, + cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health()); + EXPECT_TRUE(cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->healthFlagGet( + Host::HealthFlag::FAILED_ACTIVE_HC)); +} + +// Test receiving GOAWAY (no error) is handled gracefully while a check is in progress. +TEST_F(HttpHealthCheckerImplTest, GoAwayProbeInProgress) { + setupHCHttp2(); + EXPECT_CALL(runtime_.snapshot_, featureEnabled("health_check.verify_cluster", 100)) + .WillRepeatedly(Return(false)); + cluster_->prioritySet().getMockHostSet(0)->hosts_ = { + makeTestHost(cluster_->info_, "tcp://127.0.0.1:80", simTime())}; + cluster_->info_->stats().upstream_cx_total_.inc(); + + expectSessionCreate(); + expectStreamCreate(0); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_, _)); + health_checker_->start(); + + EXPECT_CALL( + runtime_.snapshot_, + runtimeFeatureEnabled("envoy.reloadable_features.health_check.graceful_goaway_handling")) + .WillOnce(Return(true)); + // GOAWAY with NO_ERROR code during check should be handled gracefully. + test_sessions_[0]->codec_client_->raiseGoAway(Http::GoAwayErrorCode::NoError); + EXPECT_EQ(Host::Health::Healthy, cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health()); + + expectUnchanged(0); + respond(0, "200", false, false, true, false, {}, false); + + // GOAWAY should cause a new connection to be created. + expectClientCreate(0); + expectStreamCreate(0); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_, _)); + test_sessions_[0]->interval_timer_->invokeCallback(); + + // Test host state hasn't changed. + expectUnchanged(0); + respond(0, "200", false, false, true, false, {}, false); + EXPECT_EQ(Host::Health::Healthy, cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health()); +} + +// Test receiving GOAWAY (no error) closes connection after an in progress probe times outs. +TEST_F(HttpHealthCheckerImplTest, GoAwayProbeInProgressTimeout) { + setupHCHttp2(); + cluster_->prioritySet().getMockHostSet(0)->hosts_ = { + makeTestHost(cluster_->info_, "tcp://127.0.0.1:80", simTime())}; + EXPECT_CALL(runtime_.snapshot_, featureEnabled("health_check.verify_cluster", 100)) + .WillRepeatedly(Return(false)); + + expectSessionCreate(); + expectStreamCreate(0); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_, _)); + health_checker_->start(); + + EXPECT_CALL( + runtime_.snapshot_, + runtimeFeatureEnabled("envoy.reloadable_features.health_check.graceful_goaway_handling")) + .WillOnce(Return(true)); + test_sessions_[0]->codec_client_->raiseGoAway(Http::GoAwayErrorCode::NoError); + EXPECT_EQ(Host::Health::Healthy, cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health()); + + // Unhealthy threshold is 1 so first timeout causes unhealthy + expectUnhealthyTransition(0, true); + test_sessions_[0]->timeout_timer_->invokeCallback(); + EXPECT_EQ(Host::Health::Unhealthy, + cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health()); + + // GOAWAY should cause a new connection to be created. + expectClientCreate(0); + expectStreamCreate(0); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_, _)); + test_sessions_[0]->interval_timer_->invokeCallback(); + + // Host should go back to healthy after a successful check. + expectHealthyTransition(0); + respond(0, "200", false, false, true, false, {}, false); + EXPECT_EQ(Host::Health::Healthy, cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health()); +} + +// Test receiving GOAWAY (no error) closes connection after a stream reset. +TEST_F(HttpHealthCheckerImplTest, GoAwayProbeInProgressStreamReset) { + setupHCHttp2(); + cluster_->prioritySet().getMockHostSet(0)->hosts_ = { + makeTestHost(cluster_->info_, "tcp://127.0.0.1:80", simTime())}; + EXPECT_CALL(runtime_.snapshot_, featureEnabled("health_check.verify_cluster", 100)) + .WillRepeatedly(Return(false)); + + expectSessionCreate(); + expectStreamCreate(0); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_, _)); + health_checker_->start(); + + EXPECT_CALL( + runtime_.snapshot_, + runtimeFeatureEnabled("envoy.reloadable_features.health_check.graceful_goaway_handling")) + .WillOnce(Return(true)); + test_sessions_[0]->codec_client_->raiseGoAway(Http::GoAwayErrorCode::NoError); + EXPECT_EQ(Host::Health::Healthy, cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health()); + + // Unhealthy threshold is 1 so first timeout causes unhealthy + expectUnhealthyTransition(0, true); + test_sessions_[0]->request_encoder_.stream_.resetStream(Http::StreamResetReason::RemoteReset); + EXPECT_EQ(Host::Health::Unhealthy, + cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health()); + + // GOAWAY should cause a new connection to be created. + expectClientCreate(0); + expectStreamCreate(0); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_, _)); + test_sessions_[0]->interval_timer_->invokeCallback(); + + // Host should go back to healthy after a successful check. + expectHealthyTransition(0); + respond(0, "200", false, false, true, false, {}, false); + EXPECT_EQ(Host::Health::Healthy, cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health()); +} + +// Test receiving GOAWAY (no error) and a connection close. +TEST_F(HttpHealthCheckerImplTest, GoAwayProbeInProgressConnectionClose) { + setupHCHttp2(); + cluster_->prioritySet().getMockHostSet(0)->hosts_ = { + makeTestHost(cluster_->info_, "tcp://127.0.0.1:80", simTime())}; + EXPECT_CALL(runtime_.snapshot_, featureEnabled("health_check.verify_cluster", 100)) + .WillRepeatedly(Return(false)); + + expectSessionCreate(); + expectStreamCreate(0); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_, _)); + health_checker_->start(); + + EXPECT_CALL( + runtime_.snapshot_, + runtimeFeatureEnabled("envoy.reloadable_features.health_check.graceful_goaway_handling")) + .WillOnce(Return(true)); + test_sessions_[0]->codec_client_->raiseGoAway(Http::GoAwayErrorCode::NoError); + EXPECT_EQ(Host::Health::Healthy, cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health()); + + // Unhealthy threshold is 1 so first timeout causes unhealthy + expectUnhealthyTransition(0, true); + test_sessions_[0]->client_connection_->raiseEvent(Network::ConnectionEvent::RemoteClose); + EXPECT_EQ(Host::Health::Unhealthy, + cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health()); + + // GOAWAY should cause a new connection to be created. + expectClientCreate(0); + expectStreamCreate(0); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_, _)); + test_sessions_[0]->interval_timer_->invokeCallback(); + + // Host should go back to healthy after a successful check. + expectHealthyTransition(0); + respond(0, "200", false, false, true, false, {}, false); + EXPECT_EQ(Host::Health::Healthy, cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health()); +} + +// Test receiving GOAWAY between checks affects nothing. +TEST_F(HttpHealthCheckerImplTest, GoAwayBetweenChecks) { + setupHCHttp2(); + cluster_->prioritySet().getMockHostSet(0)->hosts_ = { + makeTestHost(cluster_->info_, "tcp://127.0.0.1:80", simTime())}; + EXPECT_CALL(runtime_.snapshot_, featureEnabled("health_check.verify_cluster", 100)) + .WillRepeatedly(Return(false)); + + expectSessionCreate(); + expectStreamCreate(0); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_, _)); + health_checker_->start(); + + expectUnchanged(0); + respond(0, "200", false, false, true, false, {}, false); + EXPECT_EQ(Host::Health::Healthy, cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health()); + + EXPECT_CALL( + runtime_.snapshot_, + runtimeFeatureEnabled("envoy.reloadable_features.health_check.graceful_goaway_handling")) + .WillOnce(Return(true)); + // GOAWAY should cause a new connection to be created but should not affect health status. + test_sessions_[0]->codec_client_->raiseGoAway(Http::GoAwayErrorCode::Other); + EXPECT_EQ(Host::Health::Healthy, cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health()); + + expectClientCreate(0); + expectStreamCreate(0); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_, _)); + test_sessions_[0]->interval_timer_->invokeCallback(); + + // Host should stay healthy. + expectUnchanged(0); + respond(0, "200", false, false, true, false, {}, false); + EXPECT_EQ(Host::Health::Healthy, cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health()); +} + +// Smoke test that receiving GOAWAY (no error) is ignored when GOAWAY handling +// is disabled via runtime. This is based on the +// GoAwayProbeInProgressStreamReset test case. A single case gives us sufficient +// coverage due to the simplicity of the runtime check. +TEST_F(HttpHealthCheckerImplTest, GoAwayProbeInProgressStreamResetRuntimeDisabled) { + setupHCHttp2(); + cluster_->prioritySet().getMockHostSet(0)->hosts_ = { + makeTestHost(cluster_->info_, "tcp://127.0.0.1:80", simTime())}; + EXPECT_CALL(runtime_.snapshot_, featureEnabled("health_check.verify_cluster", 100)) + .WillRepeatedly(Return(false)); + + expectSessionCreate(); + expectStreamCreate(0); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_, _)); + health_checker_->start(); + + EXPECT_CALL( + runtime_.snapshot_, + runtimeFeatureEnabled("envoy.reloadable_features.health_check.graceful_goaway_handling")) + .WillOnce(Return(false)); + + test_sessions_[0]->codec_client_->raiseGoAway(Http::GoAwayErrorCode::NoError); + EXPECT_EQ(Host::Health::Healthy, cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health()); + + // Unhealthy threshold is 1 so the first reset causes the host to go unhealthy. + expectUnhealthyTransition(0, true); + test_sessions_[0]->request_encoder_.stream_.resetStream(Http::StreamResetReason::RemoteReset); + EXPECT_EQ(Host::Health::Unhealthy, + cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health()); + + // The new stream should be created on the same connection (old behavior since + // the new behavior is disabled via runtime). + expectStreamCreate(0); + EXPECT_CALL(*test_sessions_[0]->timeout_timer_, enableTimer(_, _)); + test_sessions_[0]->interval_timer_->invokeCallback(); + + // Host should go back to healthy after a successful check. + expectHealthyTransition(0); + respond(0, "200", false, false, true, false, {}, false); + EXPECT_EQ(Host::Health::Healthy, cluster_->prioritySet().getMockHostSet(0)->hosts_[0]->health()); +} + class TestProdHttpHealthChecker : public ProdHttpHealthCheckerImpl { public: using ProdHttpHealthCheckerImpl::ProdHttpHealthCheckerImpl; diff --git a/test/integration/fake_upstream.cc b/test/integration/fake_upstream.cc index 0b18b9e8c9b64..6d77d9b461173 100644 --- a/test/integration/fake_upstream.cc +++ b/test/integration/fake_upstream.cc @@ -377,6 +377,24 @@ void FakeHttpConnection::onGoAway(Http::GoAwayErrorCode code) { ENVOY_LOG(info, "FakeHttpConnection receives GOAWAY: ", code); } +void FakeHttpConnection::encodeGoAway() { + ASSERT(type_ == Type::HTTP2); + + shared_connection_.connection().dispatcher().post([this]() { codec_->goAway(); }); +} + +void FakeHttpConnection::encodeProtocolError() { + ASSERT(type_ == Type::HTTP2); + + Http::Http2::ServerConnectionImpl* codec = + dynamic_cast(codec_.get()); + ASSERT(codec != nullptr); + shared_connection_.connection().dispatcher().post([codec]() { + Http::Status status = codec->protocolErrorForTest(); + ASSERT(Http::getStatusCode(status) == Http::StatusCode::CodecProtocolError); + }); +} + AssertionResult FakeConnectionBase::waitForDisconnect(milliseconds timeout) { ENVOY_LOG(trace, "FakeConnectionBase waiting for disconnect"); absl::MutexLock lock(&lock_); diff --git a/test/integration/fake_upstream.h b/test/integration/fake_upstream.h index d68f86677764c..728b224101f8f 100644 --- a/test/integration/fake_upstream.h +++ b/test/integration/fake_upstream.h @@ -428,6 +428,12 @@ class FakeHttpConnection : public Http::ServerConnectionCallbacks, public FakeCo // Should only be called for HTTP2 void onGoAway(Http::GoAwayErrorCode code) override; + // Should only be called for HTTP2, sends a GOAWAY frame with NO_ERROR. + void encodeGoAway(); + + // Should only be called for HTTP2, sends a GOAWAY frame with ENHANCE_YOUR_CALM. + void encodeProtocolError(); + private: struct ReadFilter : public Network::ReadFilterBaseImpl { ReadFilter(FakeHttpConnection& parent) : parent_(parent) {} diff --git a/test/integration/health_check_integration_test.cc b/test/integration/health_check_integration_test.cc index a129e4947543e..f6a4038053511 100644 --- a/test/integration/health_check_integration_test.cc +++ b/test/integration/health_check_integration_test.cc @@ -15,8 +15,7 @@ namespace { // Integration tests for active health checking. // The tests fetch the cluster configuration using CDS in order to actively start health // checking after Envoy and the hosts are initialized. -class HealthCheckIntegrationTestBase : public Event::TestUsingSimulatedTime, - public HttpIntegrationTest { +class HealthCheckIntegrationTestBase : public HttpIntegrationTest { public: HealthCheckIntegrationTestBase( Network::Address::IpVersion ip_version, @@ -135,11 +134,11 @@ struct HttpHealthCheckIntegrationTestParams { FakeHttpConnection::Type upstream_protocol; }; -class HttpHealthCheckIntegrationTest +class HttpHealthCheckIntegrationTestBase : public testing::TestWithParam, public HealthCheckIntegrationTestBase { public: - HttpHealthCheckIntegrationTest() + HttpHealthCheckIntegrationTestBase() : HealthCheckIntegrationTestBase(GetParam().ip_version, GetParam().upstream_protocol) {} // Returns the 4 combinations for testing: @@ -201,11 +200,21 @@ class HttpHealthCheckIntegrationTest } }; +class HttpHealthCheckIntegrationTest : public Event::TestUsingSimulatedTime, + public HttpHealthCheckIntegrationTestBase {}; + INSTANTIATE_TEST_SUITE_P( IpHttpVersions, HttpHealthCheckIntegrationTest, testing::ValuesIn(HttpHealthCheckIntegrationTest::getHttpHealthCheckIntegrationTestParams()), HttpHealthCheckIntegrationTest::httpHealthCheckTestParamsToString); +class RealTimeHttpHealthCheckIntegrationTest : public HttpHealthCheckIntegrationTestBase {}; + +INSTANTIATE_TEST_SUITE_P( + IpHttpVersions, RealTimeHttpHealthCheckIntegrationTest, + testing::ValuesIn(HttpHealthCheckIntegrationTest::getHttpHealthCheckIntegrationTestParams()), + HttpHealthCheckIntegrationTest::httpHealthCheckTestParamsToString); + // Tests that a healthy endpoint returns a valid HTTP health check response. TEST_P(HttpHealthCheckIntegrationTest, SingleEndpointHealthyHttp) { const uint32_t cluster_idx = 0; @@ -254,6 +263,111 @@ TEST_P(HttpHealthCheckIntegrationTest, SingleEndpointTimeoutHttp) { EXPECT_EQ(1, test_server_->counter("cluster.cluster_1.health_check.failure")->value()); } +// Tests that health checking gracefully handles a NO_ERROR GOAWAY from the upstream. +TEST_P(HttpHealthCheckIntegrationTest, SingleEndpointGoAway) { + initialize(); + + // GOAWAY doesn't exist in HTTP1. + if (upstream_protocol_ == FakeHttpConnection::Type::HTTP1) { + return; + } + + const uint32_t cluster_idx = 0; + initHttpHealthCheck(cluster_idx); + + // Send a GOAWAY with NO_ERROR and then a 200. The health checker should allow the request + // to finish despite the GOAWAY. + clusters_[cluster_idx].host_fake_connection_->encodeGoAway(); + clusters_[cluster_idx].host_stream_->encodeHeaders( + Http::TestResponseHeaderMapImpl{{":status", "200"}}, true); + + test_server_->waitForCounterGe("cluster.cluster_1.health_check.success", 1); + EXPECT_EQ(1, test_server_->counter("cluster.cluster_1.health_check.success")->value()); + EXPECT_EQ(0, test_server_->counter("cluster.cluster_1.health_check.failure")->value()); + ASSERT_TRUE(clusters_[cluster_idx].host_fake_connection_->waitForDisconnect()); + + // Advance time to cause another health check. + timeSystem().advanceTimeWait(std::chrono::milliseconds(500)); + + ASSERT_TRUE(clusters_[cluster_idx].host_upstream_->waitForHttpConnection( + *dispatcher_, clusters_[cluster_idx].host_fake_connection_)); + ASSERT_TRUE(clusters_[cluster_idx].host_fake_connection_->waitForNewStream( + *dispatcher_, clusters_[cluster_idx].host_stream_)); + ASSERT_TRUE(clusters_[cluster_idx].host_stream_->waitForEndStream(*dispatcher_)); + + EXPECT_EQ(clusters_[cluster_idx].host_stream_->headers().getPathValue(), "/healthcheck"); + EXPECT_EQ(clusters_[cluster_idx].host_stream_->headers().getMethodValue(), "GET"); + EXPECT_EQ(clusters_[cluster_idx].host_stream_->headers().getHostValue(), + clusters_[cluster_idx].name_); + + clusters_[cluster_idx].host_stream_->encodeHeaders( + Http::TestResponseHeaderMapImpl{{":status", "200"}}, true); + + test_server_->waitForCounterGe("cluster.cluster_1.health_check.success", 2); + EXPECT_EQ(2, test_server_->counter("cluster.cluster_1.health_check.success")->value()); + EXPECT_EQ(0, test_server_->counter("cluster.cluster_1.health_check.failure")->value()); +} + +// Tests that health checking properly handles a GOAWAY with an error, followed +// by a reset. This test uses the real time system because it flakes with +// simulated time. +// The test goes through this sequence: +// 1) send a GOAWAY with a PROTOCOL_ERROR code from the upstream +// 2) waitForDisconnect on the health check connection +// 3) advance time to trigger another health check +// 4) wait for a new health check on a new connection. +// +// The flake was caused by the GOAWAY simultaneously causing the downstream +// health checker to close the connection (because of the GOAWAY) and the fake +// upstream to also close the connection (because of special handling for +// protocol errors). This meant that waitForDisconnect() could finish waiting +// before the health checker saw the GOAWAY and enabled the health check +// interval timer. This would cause simulated time to advance too early, and no +// followup health check would happen. Using real time solves this because then +// the ordering of advancing the time system and enabling the health check timer +// is inconsequential. +TEST_P(RealTimeHttpHealthCheckIntegrationTest, SingleEndpointGoAwayErroSingleEndpointGoAwayErrorr) { + initialize(); + + // GOAWAY doesn't exist in HTTP1. + if (upstream_protocol_ == FakeHttpConnection::Type::HTTP1) { + return; + } + + const uint32_t cluster_idx = 0; + initHttpHealthCheck(cluster_idx); + + // Send a GOAWAY with an error. The health checker should treat this as an + // error and cancel the request. + clusters_[cluster_idx].host_fake_connection_->encodeProtocolError(); + + ASSERT_TRUE(clusters_[cluster_idx].host_fake_connection_->waitForDisconnect()); + test_server_->waitForCounterGe("cluster.cluster_1.health_check.failure", 1); + EXPECT_EQ(0, test_server_->counter("cluster.cluster_1.health_check.success")->value()); + EXPECT_EQ(1, test_server_->counter("cluster.cluster_1.health_check.failure")->value()); + + // Advance time to cause another health check. + timeSystem().advanceTimeWait(std::chrono::milliseconds(500)); + + ASSERT_TRUE(clusters_[cluster_idx].host_upstream_->waitForHttpConnection( + *dispatcher_, clusters_[cluster_idx].host_fake_connection_)); + ASSERT_TRUE(clusters_[cluster_idx].host_fake_connection_->waitForNewStream( + *dispatcher_, clusters_[cluster_idx].host_stream_)); + ASSERT_TRUE(clusters_[cluster_idx].host_stream_->waitForEndStream(*dispatcher_)); + + EXPECT_EQ(clusters_[cluster_idx].host_stream_->headers().getPathValue(), "/healthcheck"); + EXPECT_EQ(clusters_[cluster_idx].host_stream_->headers().getMethodValue(), "GET"); + EXPECT_EQ(clusters_[cluster_idx].host_stream_->headers().getHostValue(), + clusters_[cluster_idx].name_); + + clusters_[cluster_idx].host_stream_->encodeHeaders( + Http::TestResponseHeaderMapImpl{{":status", "200"}}, true); + + test_server_->waitForCounterGe("cluster.cluster_1.health_check.success", 1); + EXPECT_EQ(1, test_server_->counter("cluster.cluster_1.health_check.success")->value()); + EXPECT_EQ(1, test_server_->counter("cluster.cluster_1.health_check.failure")->value()); +} + class TcpHealthCheckIntegrationTest : public testing::TestWithParam, public HealthCheckIntegrationTestBase { public: