diff --git a/ci/do_ci.sh b/ci/do_ci.sh index 58a7bdab2..792f617a3 100755 --- a/ci/do_ci.sh +++ b/ci/do_ci.sh @@ -40,7 +40,7 @@ function do_clang_tidy() { function do_unit_test_coverage() { export TEST_TARGETS="//test/... -//test:python_test" - export COVERAGE_THRESHOLD=94.0 + export COVERAGE_THRESHOLD=94.3 echo "bazel coverage build with tests ${TEST_TARGETS}" test/run_nighthawk_bazel_coverage.sh ${TEST_TARGETS} exit 0 diff --git a/source/client/benchmark_client_impl.cc b/source/client/benchmark_client_impl.cc index 53428424a..faa564583 100644 --- a/source/client/benchmark_client_impl.cc +++ b/source/client/benchmark_client_impl.cc @@ -107,9 +107,23 @@ BenchmarkClientHttpImpl::BenchmarkClientHttpImpl( } void BenchmarkClientHttpImpl::terminate() { - if (pool() != nullptr) { - pool()->addDrainedCallback([this]() -> void { dispatcher_.exit(); }); - pool()->drainConnections(); + if (pool() != nullptr && pool()->hasActiveConnections()) { + // We don't report what happens after this call in the output, but latencies may still be + // reported via callbacks. This may happen after a long time (60s), which HdrHistogram can't + // track the way we configure it today, as that exceeds the max that it can record. + // No harm is done, but it does result in log lines warning about it. Avoid that, by + // disabling latency measurement here. + setShouldMeasureLatencies(false); + pool()->addDrainedCallback([this]() -> void { + drain_timer_->disableTimer(); + dispatcher_.exit(); + }); + // Set up a timer with a callback which caps the time we wait for the pool to drain. 
+ drain_timer_ = dispatcher_.createTimer([this]() -> void { + ENVOY_LOG(info, "Wait for the connection pool drain timed out, proceeding to hard shutdown."); + dispatcher_.exit(); + }); + drain_timer_->enableTimer(5s); dispatcher_.run(Envoy::Event::Dispatcher::RunType::RunUntilExit); } } diff --git a/source/client/benchmark_client_impl.h b/source/client/benchmark_client_impl.h index 48a174cdd..442217b28 100644 --- a/source/client/benchmark_client_impl.h +++ b/source/client/benchmark_client_impl.h @@ -165,6 +165,7 @@ class BenchmarkClientHttpImpl : public BenchmarkClient, const RequestGenerator request_generator_; const bool provide_resource_backpressure_; const std::string latency_response_header_name_; + Envoy::Event::TimerPtr drain_timer_; }; } // namespace Client diff --git a/test/benchmark_http_client_test.cc b/test/benchmark_http_client_test.cc index a429a1863..b473db323 100644 --- a/test/benchmark_http_client_test.cc +++ b/test/benchmark_http_client_test.cc @@ -216,7 +216,7 @@ class BenchmarkClientHttpTest : public Test { int worker_number_{0}; Client::BenchmarkClientStatistic statistic_; std::shared_ptr default_header_map_; -}; // namespace Nighthawk +}; TEST_F(BenchmarkClientHttpTest, BasicTestH1200) { response_code_ = "200"; @@ -419,4 +419,28 @@ TEST_F(BenchmarkClientHttpTest, RequestGeneratorProvidingDifferentPathsSendsRequ &expected_requests); EXPECT_EQ(2, getCounter("http_2xx")); } + +TEST_F(BenchmarkClientHttpTest, DrainTimeoutFires) { + RequestGenerator default_request_generator = getDefaultRequestGenerator(); + setupBenchmarkClient(default_request_generator); + EXPECT_CALL(pool_, newStream(_, _)) + .WillOnce( + [this](Envoy::Http::ResponseDecoder& decoder, Envoy::Http::ConnectionPool::Callbacks&) + -> Envoy::Http::ConnectionPool::Cancellable* { + // The decoder self-terminates in normal operation, but in this test that won't + happen. So we delete it ourselves. 
Note that we run our integration test with + // asan, so any leaks in real usage ought to be caught there. + delete &decoder; + client_->terminate(); + return nullptr; + }); + EXPECT_CALL(pool_, hasActiveConnections()).WillOnce([]() -> bool { return true; }); + EXPECT_CALL(pool_, addDrainedCallback(_)); + // We don't expect the callback that we pass here to fire. + client_->tryStartRequest([](bool, bool) { EXPECT_TRUE(false); }); + // To get past this, the drain timeout within the benchmark client must execute. + dispatcher_->run(Envoy::Event::Dispatcher::RunType::Block); + EXPECT_EQ(0, getCounter("http_2xx")); +} + } // namespace Nighthawk diff --git a/test/integration/configurations/nighthawk_https_origin.yaml b/test/integration/configurations/nighthawk_https_origin.yaml index 3423897e3..152f75fe6 100644 --- a/test/integration/configurations/nighthawk_https_origin.yaml +++ b/test/integration/configurations/nighthawk_https_origin.yaml @@ -24,6 +24,7 @@ static_resources: domains: - "*" http_filters: + - name: dynamic-delay - name: test-server typed_config: "@type": type.googleapis.com/nighthawk.server.ResponseOptions diff --git a/test/integration/test_integration_basics.py b/test/integration/test_integration_basics.py index 630095abb..ea3bf4e43 100644 --- a/test/integration/test_integration_basics.py +++ b/test/integration/test_integration_basics.py @@ -794,3 +794,26 @@ def test_client_cli_bad_uri(http_test_server_fixture): expect_failure=True, as_json=False) assert "Invalid target URI" in err + + +@pytest.mark.parametrize('server_config', + ["nighthawk/test/integration/configurations/nighthawk_https_origin.yaml"]) +def test_drain(https_test_server_fixture): + """Test that the pool drain timeout is effective, and we terminate in a timely fashion. + + Sets up the test server to delay replies 100 seconds. Our execution will only last 3 seconds, so we + expect to observe no replies. 
Termination should be cut short by the drain timeout, which means + that we should have results in approximately execution duration + drain timeout = 8 seconds. + (the pool drain timeout is hard coded to 5 seconds as of writing this). + """ + t0 = time.time() + parsed_json, _ = https_test_server_fixture.runNighthawkClient([ + https_test_server_fixture.getTestServerRootUri(), "--rps", "100", "--duration", "3", + "--request-header", "x-nighthawk-test-server-config: {static_delay: \"100s\"}" + ]) + t1 = time.time() + time_delta = t1 - t0 + counters = https_test_server_fixture.getNighthawkCounterMapFromJson(parsed_json) + assert time_delta < 40 # *lots* of slack to avoid failure in slow CI executions. + asserts.assertCounterGreaterEqual(counters, "upstream_cx_http1_total", 1) + asserts.assertNotIn("benchmark.http_2xx", counters)