Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion ci/do_ci.sh
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ function do_clang_tidy() {

function do_unit_test_coverage() {
export TEST_TARGETS="//test/... -//test:python_test"
export COVERAGE_THRESHOLD=94.0
export COVERAGE_THRESHOLD=94.3
echo "bazel coverage build with tests ${TEST_TARGETS}"
test/run_nighthawk_bazel_coverage.sh ${TEST_TARGETS}
exit 0
Expand Down
20 changes: 17 additions & 3 deletions source/client/benchmark_client_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,23 @@ BenchmarkClientHttpImpl::BenchmarkClientHttpImpl(
}

void BenchmarkClientHttpImpl::terminate() {
  // NOTE(review): this span in SOURCE interleaved the pre-change body (unconditional
  // addDrainedCallback + drainConnections) with the post-change body; the merged
  // post-change implementation is reconstructed below.
  if (pool() != nullptr && pool()->hasActiveConnections()) {
    // We don't report what happens after this call in the output, but latencies may still be
    // reported via callbacks. This may happen after a long time (60s), which HdrHistogram can't
    // track the way we configure it today, as that exceeds the max that it can record.
    // No harm is done, but it does result in log lines warning about it. Avoid that, by
    // disabling latency measurement here.
    setShouldMeasureLatencies(false);
    // Normal path: when the pool finishes draining, cancel the watchdog timer and
    // leave the dispatcher loop started below.
    pool()->addDrainedCallback([this]() -> void {
      drain_timer_->disableTimer();
      dispatcher_.exit();
    });
    // Set up a timer with a callback which caps the time we wait for the pool to drain.
    drain_timer_ = dispatcher_.createTimer([this]() -> void {
      ENVOY_LOG(info, "Wait for the connection pool drain timed out, proceeding to hard shutdown.");
      dispatcher_.exit();
    });
    drain_timer_->enableTimer(5s);
    // Block until either the drained callback or the timeout fires; presumably pool
    // teardown elsewhere initiates the actual drain — TODO confirm no explicit
    // drainConnections() call is needed here.
    dispatcher_.run(Envoy::Event::Dispatcher::RunType::RunUntilExit);
  }
}
Expand Down
1 change: 1 addition & 0 deletions source/client/benchmark_client_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,7 @@ class BenchmarkClientHttpImpl : public BenchmarkClient,
const RequestGenerator request_generator_;
const bool provide_resource_backpressure_;
const std::string latency_response_header_name_;
Envoy::Event::TimerPtr drain_timer_;
};

} // namespace Client
Expand Down
26 changes: 25 additions & 1 deletion test/benchmark_http_client_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -216,7 +216,7 @@ class BenchmarkClientHttpTest : public Test {
int worker_number_{0};
Client::BenchmarkClientStatistic statistic_;
std::shared_ptr<Envoy::Http::RequestHeaderMap> default_header_map_;
}; // namespace Nighthawk
};

TEST_F(BenchmarkClientHttpTest, BasicTestH1200) {
response_code_ = "200";
Expand Down Expand Up @@ -419,4 +419,28 @@ TEST_F(BenchmarkClientHttpTest, RequestGeneratorProvidingDifferentPathsSendsRequ
&expected_requests);
EXPECT_EQ(2, getCounter("http_2xx"));
}

// Verifies that terminate() does not hang when the pool reports active connections
// but the drained callback never fires: the drain timeout inside the benchmark
// client must break out of the dispatcher loop. No responses complete, so the
// http_2xx counter stays at zero.
TEST_F(BenchmarkClientHttpTest, DrainTimeoutFires) {
RequestGenerator default_request_generator = getDefaultRequestGenerator();
setupBenchmarkClient(default_request_generator);
EXPECT_CALL(pool_, newStream(_, _))
.WillOnce(
[this](Envoy::Http::ResponseDecoder& decoder, Envoy::Http::ConnectionPool::Callbacks&)
-> Envoy::Http::ConnectionPool::Cancellable* {
// The decoder self-terminates in normal operation, but in this test that won't
// happen. So we delete it ourselves. Note that we run our integration test with
// asan, so any leaks in real usage ought to be caught there.
delete &decoder;
client_->terminate();
return nullptr;
});
// Reporting active connections makes terminate() take the drain-and-wait path.
EXPECT_CALL(pool_, hasActiveConnections()).WillOnce([]() -> bool { return true; });
// The drained callback is registered but intentionally never invoked by this mock.
EXPECT_CALL(pool_, addDrainedCallback(_));
// We don't expect the callback that we pass here to fire.
client_->tryStartRequest([](bool, bool) { EXPECT_TRUE(false); });
// To get past this, the drain timeout within the benchmark client must execute.
dispatcher_->run(Envoy::Event::Dispatcher::RunType::Block);
EXPECT_EQ(0, getCounter("http_2xx"));
}

} // namespace Nighthawk
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ static_resources:
domains:
- "*"
http_filters:
- name: dynamic-delay
- name: test-server
typed_config:
"@type": type.googleapis.com/nighthawk.server.ResponseOptions
Expand Down
23 changes: 23 additions & 0 deletions test/integration/test_integration_basics.py
Original file line number Diff line number Diff line change
Expand Up @@ -794,3 +794,26 @@ def test_client_cli_bad_uri(http_test_server_fixture):
expect_failure=True,
as_json=False)
assert "Invalid target URI" in err


@pytest.mark.parametrize('server_config',
                         ["nighthawk/test/integration/configurations/nighthawk_https_origin.yaml"])
def test_drain(https_test_server_fixture):
  """Test that the pool drain timeout is effective, and we terminate in a timely fashion.

  Sets up the test server to delay replies 100 seconds. Our execution will only last 3 seconds, so we
  expect to observe no replies. Termination should be cut short by the drain timeout, which means
  that we should have results in approximately execution duration + drain timeout = 8 seconds.
  (the pool drain timeout is hard coded to 5 seconds as of writing this).
  """
  started_at = time.time()
  parsed_json, _ = https_test_server_fixture.runNighthawkClient([
      https_test_server_fixture.getTestServerRootUri(), "--rps", "100", "--duration", "3",
      "--request-header", "x-nighthawk-test-server-config: {static_delay: \"100s\"}"
  ])
  elapsed = time.time() - started_at
  counters = https_test_server_fixture.getNighthawkCounterMapFromJson(parsed_json)
  # *lots* of slack to avoid failure in slow CI executions.
  assert elapsed < 40
  asserts.assertCounterGreaterEqual(counters, "upstream_cx_http1_total", 1)
  asserts.assertNotIn("benchmark.http_2xx", counters)