From 8f1fdf2253477c632912a2b29eacb4aaaa434352 Mon Sep 17 00:00:00 2001 From: Otto van der Schaaf Date: Fri, 12 Feb 2021 23:29:05 +0100 Subject: [PATCH 1/6] Fix potential hangs on exit & cap drain wait time - don't set up a drain callback when there are no active connections - set up a timer that will cap the amount of time we wait for the pool to drain. - disable latency measurement when commencing the drain procedure, as by that time we are no longer interested in it, and in particular don't want to hear about any warnings issued by the Statistic implementation about recorded values being too large. Fixes #627 Signed-off-by: Otto van der Schaaf --- source/client/benchmark_client_impl.cc | 21 +++++++++++++++++++-- source/client/benchmark_client_impl.h | 1 + 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/source/client/benchmark_client_impl.cc b/source/client/benchmark_client_impl.cc index 53428424a..8c9875132 100644 --- a/source/client/benchmark_client_impl.cc +++ b/source/client/benchmark_client_impl.cc @@ -107,9 +107,26 @@ BenchmarkClientHttpImpl::BenchmarkClientHttpImpl( } void BenchmarkClientHttpImpl::terminate() { - if (pool() != nullptr) { - pool()->addDrainedCallback([this]() -> void { dispatcher_.exit(); }); + if (pool() != nullptr && pool()->hasActiveConnections()) { + // We don't report what happens after this call in the output, but latencies may still be + // reported via callbacks. This may happen after a long time (60s), which HdrHistogram can't + // track the way we configure it today, as that exceeds the max that it can record. + // No harm is done, but it does result in log lines warning about it. Avoid that, by + // disabling latency measurement here. + setShouldMeasureLatencies(false); + pool()->addDrainedCallback([this]() -> void { + // We no longer need to the drain timer. Today this dispatcher is done for, but in the future + // it may get re-used as another phase continues execution for the pool. Therefore disarm it. + drain_timer_->disableTimer(); + dispatcher_.exit(); + }); pool()->drainConnections(); + // Set up a timer with a callback which caps the time we wait for the pool to drain. + drain_timer_ = dispatcher_.createTimer([this]() -> void { + ENVOY_LOG(warn, "Connection pool drain timed out."); + dispatcher_.exit(); + }); + drain_timer_->enableTimer(5s); dispatcher_.run(Envoy::Event::Dispatcher::RunType::RunUntilExit); } } diff --git a/source/client/benchmark_client_impl.h b/source/client/benchmark_client_impl.h index 48a174cdd..442217b28 100644 --- a/source/client/benchmark_client_impl.h +++ b/source/client/benchmark_client_impl.h @@ -165,6 +165,7 @@ class BenchmarkClientHttpImpl : public BenchmarkClient, const RequestGenerator request_generator_; const bool provide_resource_backpressure_; const std::string latency_response_header_name_; + Envoy::Event::TimerPtr drain_timer_; }; } // namespace Client From 8e4be83fa16fb0c5fce2a75abd3a477f585c3ee8 Mon Sep 17 00:00:00 2001 From: Otto van der Schaaf Date: Sat, 13 Feb 2021 12:01:13 +0100 Subject: [PATCH 2/6] Add end to end test for the drain timeout Signed-off-by: Otto van der Schaaf --- .../nighthawk_https_origin.yaml | 1 + test/integration/test_integration_basics.py | 22 +++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/test/integration/configurations/nighthawk_https_origin.yaml b/test/integration/configurations/nighthawk_https_origin.yaml index 3423897e3..152f75fe6 100644 --- a/test/integration/configurations/nighthawk_https_origin.yaml +++ b/test/integration/configurations/nighthawk_https_origin.yaml @@ -24,6 +24,7 @@ static_resources: domains: - "*" http_filters: + - name: dynamic-delay - name: test-server typed_config: "@type": type.googleapis.com/nighthawk.server.ResponseOptions diff --git a/test/integration/test_integration_basics.py b/test/integration/test_integration_basics.py index 630095abb..987608975 100644 --- a/test/integration/test_integration_basics.py +++ b/test/integration/test_integration_basics.py @@ -794,3 +794,25 @@ def test_client_cli_bad_uri(http_test_server_fixture): expect_failure=True, as_json=False) assert "Invalid target URI" in err + +@pytest.mark.parametrize('server_config', + ["nighthawk/test/integration/configurations/nighthawk_https_origin.yaml"]) +def test_drain(https_test_server_fixture): + """Test that the pool drain timeout is effective, and we terminate in a timely fashion. + + Sets up the test server to delay replies 100 seconds. Our execution will only last 3 seconds, so we + expect to observe no replies. Termination should be cut short by the drain timeout, which means + that we should have results in approximately execution duration + drain timeout = 8 seconds. + (the pool drain timeout is hard coded to 5 seconds as of writing this). + """ + t0 = time.time() + parsed_json, _ = https_test_server_fixture.runNighthawkClient([ + https_test_server_fixture.getTestServerRootUri(), "--rps", "100", + "--duration", "3", "--request-header", "x-nighthawk-test-server-config: {static_delay: \"100s\"}" + ]) + t1 = time.time() + time_delta = t1 - t0 + counters = https_test_server_fixture.getNighthawkCounterMapFromJson(parsed_json) + assert time_delta < 20 # lots of slack to avoid failure in slow CI executions. + asserts.assertCounterGreaterEqual(counters, "upstream_cx_http1_total", 1) + asserts.assertNotIn("benchmark.http_2xx", counters) From 6ef7af0654902ebcea4a1a17996ab94b4013a6df Mon Sep 17 00:00:00 2001 From: Otto van der Schaaf Date: Sat, 13 Feb 2021 12:12:37 +0100 Subject: [PATCH 3/6] Fix format Signed-off-by: Otto van der Schaaf --- test/integration/test_integration_basics.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/integration/test_integration_basics.py b/test/integration/test_integration_basics.py index 987608975..54a08885b 100644 --- a/test/integration/test_integration_basics.py +++ b/test/integration/test_integration_basics.py @@ -795,11 +795,12 @@ def test_client_cli_bad_uri(http_test_server_fixture): as_json=False) assert "Invalid target URI" in err + @pytest.mark.parametrize('server_config', ["nighthawk/test/integration/configurations/nighthawk_https_origin.yaml"]) def test_drain(https_test_server_fixture): """Test that the pool drain timeout is effective, and we terminate in a timely fashion. - + Sets up the test server to delay replies 100 seconds. Our execution will only last 3 seconds, so we expect to observe no replies. Termination should be cut short by the drain timeout, which means that we should have results in approximately execution duration + drain timeout = 8 seconds. @@ -807,12 +808,12 @@ def test_drain(https_test_server_fixture): """ t0 = time.time() parsed_json, _ = https_test_server_fixture.runNighthawkClient([ - https_test_server_fixture.getTestServerRootUri(), "--rps", "100", - "--duration", "3", "--request-header", "x-nighthawk-test-server-config: {static_delay: \"100s\"}" + https_test_server_fixture.getTestServerRootUri(), "--rps", "100", "--duration", "3", + "--request-header", "x-nighthawk-test-server-config: {static_delay: \"100s\"}" ]) t1 = time.time() time_delta = t1 - t0 counters = https_test_server_fixture.getNighthawkCounterMapFromJson(parsed_json) - assert time_delta < 20 # lots of slack to avoid failure in slow CI executions. + assert time_delta < 20 # lots of slack to avoid failure in slow CI executions. asserts.assertCounterGreaterEqual(counters, "upstream_cx_http1_total", 1) asserts.assertNotIn("benchmark.http_2xx", counters) From 72ec88fea6b880f89632ae1ba2c150170445af65 Mon Sep 17 00:00:00 2001 From: Otto van der Schaaf Date: Sat, 13 Feb 2021 17:36:50 +0100 Subject: [PATCH 4/6] Add a unit test to cover the drain timeout path Signed-off-by: Otto van der Schaaf --- source/client/benchmark_client_impl.cc | 4 +--- test/benchmark_http_client_test.cc | 24 +++++++++++++++++++++++- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/source/client/benchmark_client_impl.cc b/source/client/benchmark_client_impl.cc index 8c9875132..3fddc4292 100644 --- a/source/client/benchmark_client_impl.cc +++ b/source/client/benchmark_client_impl.cc @@ -115,15 +115,13 @@ void BenchmarkClientHttpImpl::terminate() { // disabling latency measurement here. setShouldMeasureLatencies(false); pool()->addDrainedCallback([this]() -> void { - // We no longer need to the drain timer. Today this dispatcher is done for, but in the future - // it may get re-used as another phase continues execution for the pool. Therefore disarm it. drain_timer_->disableTimer(); dispatcher_.exit(); }); pool()->drainConnections(); // Set up a timer with a callback which caps the time we wait for the pool to drain. drain_timer_ = dispatcher_.createTimer([this]() -> void { - ENVOY_LOG(warn, "Connection pool drain timed out."); + ENVOY_LOG(info, "Wait for the connection pool drain timed out, proceeding to hard shutdown."); dispatcher_.exit(); }); drain_timer_->enableTimer(5s); diff --git a/test/benchmark_http_client_test.cc b/test/benchmark_http_client_test.cc index a429a1863..3347ba479 100644 --- a/test/benchmark_http_client_test.cc +++ b/test/benchmark_http_client_test.cc @@ -216,7 +216,7 @@ class BenchmarkClientHttpTest : public Test { int worker_number_{0}; Client::BenchmarkClientStatistic statistic_; std::shared_ptr default_header_map_; -}; // namespace Nighthawk +}; TEST_F(BenchmarkClientHttpTest, BasicTestH1200) { response_code_ = "200"; @@ -419,4 +419,26 @@ TEST_F(BenchmarkClientHttpTest, RequestGeneratorProvidingDifferentPathsSendsRequ &expected_requests); EXPECT_EQ(2, getCounter("http_2xx")); } + +TEST_F(BenchmarkClientHttpTest, DrainTimeoutFires) { + RequestGenerator default_request_generator = getDefaultRequestGenerator(); + setupBenchmarkClient(default_request_generator); + EXPECT_CALL(pool_, newStream(_, _)) + .WillOnce([this](Envoy::Http::ResponseDecoder&, Envoy::Http::ConnectionPool::Callbacks&) + -> Envoy::Http::ConnectionPool::Cancellable* { + // Now that there's an active stream, terminate the benchmark client. The benchmark client + // has to rely on the drain timeout to wrap up execution. + client_->terminate(); + return nullptr; + }); + EXPECT_CALL(pool_, hasActiveConnections()).WillOnce([]() -> bool { return true; }); + EXPECT_CALL(pool_, addDrainedCallback(_)); + EXPECT_CALL(pool_, drainConnections()); + // We don't expect the callback that we pass here to fire. + client_->tryStartRequest([](bool, bool) { EXPECT_TRUE(false); }); + // To get past this, the drain timeout within the benchmark client must execute. + dispatcher_->run(Envoy::Event::Dispatcher::RunType::Block); + EXPECT_EQ(0, getCounter("http_2xx")); +} + } // namespace Nighthawk From 4220d66a6b0b66305bb1ff3832da3f497b3fabc7 Mon Sep 17 00:00:00 2001 From: Otto van der Schaaf Date: Sat, 13 Feb 2021 18:52:06 +0100 Subject: [PATCH 5/6] - Bump the coverage threshold higher - Attempt to fix asan-detected leak - Add more timing slack to integration test, as CI needs it. Signed-off-by: Otto van der Schaaf --- ci/do_ci.sh | 2 +- test/benchmark_http_client_test.cc | 7 ++++++- test/integration/test_integration_basics.py | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git a/ci/do_ci.sh b/ci/do_ci.sh index 58a7bdab2..792f617a3 100755 --- a/ci/do_ci.sh +++ b/ci/do_ci.sh @@ -40,7 +40,7 @@ function do_clang_tidy() { function do_unit_test_coverage() { export TEST_TARGETS="//test/... -//test:python_test" - export COVERAGE_THRESHOLD=94.0 + export COVERAGE_THRESHOLD=94.3 echo "bazel coverage build with tests ${TEST_TARGETS}" test/run_nighthawk_bazel_coverage.sh ${TEST_TARGETS} exit 0 diff --git a/test/benchmark_http_client_test.cc b/test/benchmark_http_client_test.cc index 3347ba479..c884bbd74 100644 --- a/test/benchmark_http_client_test.cc +++ b/test/benchmark_http_client_test.cc @@ -424,8 +424,13 @@ TEST_F(BenchmarkClientHttpTest, DrainTimeoutFires) { RequestGenerator default_request_generator = getDefaultRequestGenerator(); setupBenchmarkClient(default_request_generator); EXPECT_CALL(pool_, newStream(_, _)) - .WillOnce([this](Envoy::Http::ResponseDecoder&, Envoy::Http::ConnectionPool::Callbacks&) + .WillOnce([this](Envoy::Http::ResponseDecoder& decoder, + Envoy::Http::ConnectionPool::Callbacks& callbacks) -> Envoy::Http::ConnectionPool::Cancellable* { + decoders_.push_back(&decoder); + NiceMock stream_info; + callbacks.onPoolReady(stream_encoder_, Envoy::Upstream::HostDescriptionConstSharedPtr{}, + stream_info, {} /*absl::optional protocol*/); // Now that there's an active stream, terminate the benchmark client. The benchmark client // has to rely on the drain timeout to wrap up execution. client_->terminate(); diff --git a/test/integration/test_integration_basics.py b/test/integration/test_integration_basics.py index 54a08885b..ea3bf4e43 100644 --- a/test/integration/test_integration_basics.py +++ b/test/integration/test_integration_basics.py @@ -814,6 +814,6 @@ def test_drain(https_test_server_fixture): t1 = time.time() time_delta = t1 - t0 counters = https_test_server_fixture.getNighthawkCounterMapFromJson(parsed_json) - assert time_delta < 20 # lots of slack to avoid failure in slow CI executions. + assert time_delta < 40 # *lots* of slack to avoid failure in slow CI executions. asserts.assertCounterGreaterEqual(counters, "upstream_cx_http1_total", 1) asserts.assertNotIn("benchmark.http_2xx", counters) From dd04c9cf1b91171357387c09773e4a6e171e2799 Mon Sep 17 00:00:00 2001 From: Otto van der Schaaf Date: Sat, 13 Feb 2021 22:24:22 +0100 Subject: [PATCH 6/6] Fix leak in test. Remove superfluous drainConnections call. Signed-off-by: Otto van der Schaaf --- source/client/benchmark_client_impl.cc | 1 - test/benchmark_http_client_test.cc | 23 ++++++++++------------- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/source/client/benchmark_client_impl.cc b/source/client/benchmark_client_impl.cc index 3fddc4292..faa564583 100644 --- a/source/client/benchmark_client_impl.cc +++ b/source/client/benchmark_client_impl.cc @@ -118,7 +118,6 @@ void BenchmarkClientHttpImpl::terminate() { drain_timer_->disableTimer(); dispatcher_.exit(); }); - pool()->drainConnections(); // Set up a timer with a callback which caps the time we wait for the pool to drain. drain_timer_ = dispatcher_.createTimer([this]() -> void { ENVOY_LOG(info, "Wait for the connection pool drain timed out, proceeding to hard shutdown."); diff --git a/test/benchmark_http_client_test.cc b/test/benchmark_http_client_test.cc index c884bbd74..b473db323 100644 --- a/test/benchmark_http_client_test.cc +++ b/test/benchmark_http_client_test.cc @@ -424,21 +424,18 @@ TEST_F(BenchmarkClientHttpTest, DrainTimeoutFires) { RequestGenerator default_request_generator = getDefaultRequestGenerator(); setupBenchmarkClient(default_request_generator); EXPECT_CALL(pool_, newStream(_, _)) - .WillOnce([this](Envoy::Http::ResponseDecoder& decoder, - Envoy::Http::ConnectionPool::Callbacks& callbacks) - -> Envoy::Http::ConnectionPool::Cancellable* { - decoders_.push_back(&decoder); - NiceMock stream_info; - callbacks.onPoolReady(stream_encoder_, Envoy::Upstream::HostDescriptionConstSharedPtr{}, - stream_info, {} /*absl::optional protocol*/); - // Now that there's an active stream, terminate the benchmark client. The benchmark client - // has to rely on the drain timeout to wrap up execution. - client_->terminate(); - return nullptr; - }); + .WillOnce( + [this](Envoy::Http::ResponseDecoder& decoder, Envoy::Http::ConnectionPool::Callbacks&) + -> Envoy::Http::ConnectionPool::Cancellable* { + // The decoder self-terminates in normal operation, but in this test that won't + // happen. Se we delete it ourselves. Note that we run our integration test with + // asan, so any leaks in real usage ought to be caught there. + delete &decoder; + client_->terminate(); + return nullptr; + }); EXPECT_CALL(pool_, hasActiveConnections()).WillOnce([]() -> bool { return true; }); EXPECT_CALL(pool_, addDrainedCallback(_)); - EXPECT_CALL(pool_, drainConnections()); // We don't expect the callback that we pass here to fire. client_->tryStartRequest([](bool, bool) { EXPECT_TRUE(false); }); // To get past this, the drain timeout within the benchmark client must execute.