Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions source/common/http/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,16 @@ envoy_cc_library(
],
)

# Library for the HTTP/3 status tracker, which records whether HTTP/3 is broken. It is built
# from http3_status_tracker.cc/.h and depends on the event dispatcher and timer interfaces.
envoy_cc_library(
name = "http3_status_tracker",
srcs = ["http3_status_tracker.cc"],
hdrs = ["http3_status_tracker.h"],
deps = [
"//include/envoy/event:dispatcher_interface",
"//include/envoy/event:timer_interface",
],
)

envoy_cc_library(
name = "alternate_protocols",
srcs = ["alternate_protocols.cc"],
Expand All @@ -163,6 +173,7 @@ envoy_cc_library(
srcs = ["conn_pool_grid.cc"],
hdrs = ["conn_pool_grid.h"],
deps = [
":http3_status_tracker",
":mixed_conn_pool",
"//source/common/http/http3:conn_pool_lib",
],
Expand Down
18 changes: 14 additions & 4 deletions source/common/http/conn_pool_grid.cc
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,11 @@ void ConnectivityGrid::WrapperCallbacks::onConnectionAttemptReady(
host->hostname());
if (!grid_.isPoolHttp3(attempt->pool())) {
tcp_attempt_succeeded_ = true;
maybeMarkHttp3Broken();
} else {
ENVOY_LOG(trace, "Marking HTTP/3 confirmed for host '{}'.", grid_.host_->hostname());
grid_.markHttp3Confirmed();
}
maybeMarkHttp3Broken();

auto delete_this_on_return = attempt->removeFromList(connection_attempts_);
ConnectionPool::Callbacks* callbacks = inner_callbacks_;
Expand All @@ -133,7 +136,7 @@ void ConnectivityGrid::WrapperCallbacks::onConnectionAttemptReady(
// Marks HTTP/3 as broken on the owning grid, but only once the HTTP/3 attempt has failed
// and the TCP attempt has succeeded. In every other case this is a no-op.
void ConnectivityGrid::WrapperCallbacks::maybeMarkHttp3Broken() {
if (http3_attempt_failed_ && tcp_attempt_succeeded_) {
ENVOY_LOG(trace, "Marking HTTP/3 broken for host '{}'.", grid_.host_->hostname());
grid_.setIsHttp3Broken(true);
grid_.markHttp3Broken();
}
}

Expand Down Expand Up @@ -190,7 +193,8 @@ ConnectivityGrid::ConnectivityGrid(
std::chrono::milliseconds next_attempt_duration, ConnectivityOptions connectivity_options)
: dispatcher_(dispatcher), random_generator_(random_generator), host_(host),
priority_(priority), options_(options), transport_socket_options_(transport_socket_options),
state_(state), next_attempt_duration_(next_attempt_duration), time_source_(time_source) {
state_(state), next_attempt_duration_(next_attempt_duration), time_source_(time_source),
http3_status_tracker_(dispatcher_) {
// ProdClusterManagerFactory::allocateConnPool verifies the protocols are HTTP/1, HTTP/2 and
// HTTP/3.
// TODO(#15649) support v6/v4, WiFi/cellular.
Expand Down Expand Up @@ -243,7 +247,7 @@ ConnectionPool::Cancellable* ConnectivityGrid::newStream(Http::ResponseDecoder&
createNextPool();
}
PoolIterator pool = pools_.begin();
if (is_http3_broken_) {
if (http3_status_tracker_.isHttp3Broken()) {
ENVOY_LOG(trace, "HTTP/3 is broken to host '{}', skipping.", describePool(**pool),
host_->hostname());
// Since HTTP/3 is broken, presumably both pools have already been created so this
Expand Down Expand Up @@ -306,6 +310,12 @@ bool ConnectivityGrid::isPoolHttp3(const ConnectionPool::Instance& pool) {
return &pool == pools_.begin()->get();
}

// Reports whether the grid's HTTP/3 status tracker currently considers HTTP/3 broken.
bool ConnectivityGrid::isHttp3Broken() const {
  return http3_status_tracker_.isHttp3Broken();
}

// Forwards to the grid's HTTP/3 status tracker to mark HTTP/3 as broken.
void ConnectivityGrid::markHttp3Broken() {
  http3_status_tracker_.markHttp3Broken();
}

// Forwards to the grid's HTTP/3 status tracker to mark HTTP/3 as confirmed working.
void ConnectivityGrid::markHttp3Confirmed() {
  http3_status_tracker_.markHttp3Confirmed();
}

void ConnectivityGrid::onDrainReceived() {
// Don't do any work under the stack of ~ConnectivityGrid()
if (destroying_) {
Expand Down
16 changes: 13 additions & 3 deletions source/common/http/conn_pool_grid.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#pragma once

#include "common/http/conn_pool_base.h"
#include "common/http/http3_status_tracker.h"

#include "absl/container/flat_hash_map.h"

Expand Down Expand Up @@ -150,8 +151,17 @@ class ConnectivityGrid : public ConnectionPool::Instance,
// Returns true if pool is the grid's HTTP/3 connection pool.
bool isPoolHttp3(const ConnectionPool::Instance& pool);

bool isHttp3Broken() const { return is_http3_broken_; }
void setIsHttp3Broken(bool is_http3_broken) { is_http3_broken_ = is_http3_broken; }
// Returns true if HTTP/3 is currently broken. While HTTP/3 is broken the grid will not
// attempt to make new HTTP/3 connections.
bool isHttp3Broken() const;

// Marks HTTP/3 broken for a period of time subject to exponential backoff. While HTTP/3
// is broken the grid will not attempt to make new HTTP/3 connections.
void markHttp3Broken();

// Marks that HTTP/3 is working, which resets the exponential backoff counter in the
// event that HTTP/3 is marked broken again.
void markHttp3Confirmed();

private:
friend class ConnectivityGridForTest;
Expand All @@ -174,7 +184,7 @@ class ConnectivityGrid : public ConnectionPool::Instance,
Upstream::ClusterConnectivityState& state_;
std::chrono::milliseconds next_attempt_duration_;
TimeSource& time_source_;
bool is_http3_broken_{};
Http3StatusTracker http3_status_tracker_;

// Tracks how many drains are needed before calling drain callbacks. This is
// set to the number of pools when the first drain callbacks are added, and
Expand Down
49 changes: 49 additions & 0 deletions source/common/http/http3_status_tracker.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#include "common/http/http3_status_tracker.h"

namespace Envoy {
namespace Http {

namespace {

// The first time HTTP/3 is marked broken it stays broken for 5 minutes. This base period
// is doubled for each consecutive time HTTP/3 is marked broken again.
constexpr std::chrono::minutes DefaultExpirationTime{5};
// Upper bound on the consecutive-broken counter that is used as the doubling exponent.
// This caps the broken period at 5 minutes * 2^8, which is just under 1 day.
constexpr int MaxConsecutiveBrokenCount{8};
} // namespace

// Creates the expiration timer on |dispatcher|. The timer's callback invokes
// onExpirationTimeout() on this tracker.
Http3StatusTracker::Http3StatusTracker(Event::Dispatcher& dispatcher)
    : expiration_timer_(dispatcher.createTimer([this] { onExpirationTimeout(); })) {}

// True while the tracker is in the Broken state.
bool Http3StatusTracker::isHttp3Broken() const {
  return state_ == State::Broken;
}

// True while the tracker is in the Confirmed state.
bool Http3StatusTracker::isHttp3Confirmed() const {
  return state_ == State::Confirmed;
}

void Http3StatusTracker::markHttp3Broken() {
state_ = State::Broken;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What are valid values for state_ when entering this method?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good question! Mentally, I'm modeling this off of the similar code in Chrome. That code runs up at the request/response layer (the HttpNetworkTransaction) which is above the connection establishment layer. As such and given that requests happen in parallel, it's possible for basically any sequence of markBroken/markConfirmed calls to arrive in any order. I suspect that we'll eventually want something similar. But since we're not doing anything like that now, there's no need to permit such state transitions. So I've added ASSERT() calls to make it clear what the valid states are. Thanks for pointing this out.

(In any case, this should be reachable from any state other than broken)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It may be possible to trigger this ASSERT, if there are 2 concurrent attempts to connect to the same endpoint. That said, it's possible that there are protections elsewhere to prevent this from happening or it is relatively unlikely to happen without a burst of requests for that service; we would need the number of requests to exceed the upstream's multiplexing factor and trigger creation of a second connection in order to meet demand.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah! Excellent point! That's very true. Ok, in that case we're back to the Chrome situation where the parallelism means that we can really get any sequence of events in any order. I've removed the ASSERT() calls.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just for my own curiosity: why would we have multiple concurrent connection attempts to the same endpoint on a given worker thread? Is this related to prefetching or part of the grid logic?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm relatively unclear at this point on exactly how this all plays together. From what Antonio said, it sounds like if there were a sufficient number of simultaneous requests we might trigger the creation of a second attempt. The other case, which I think I heard from Alyssa, is that it's possible to have multiple calls to ConnectivityGrid::newStream() happen before the first call finishes. This won't result in multiple TCP/QUIC connection attempts because the underlying connection pool will do the right thing. But I think this is transparent to the ConnectivityGrid; each call to newStream creates a new WrapperCallbacks and (up to) 2 ConnectionAttempts, which should mean it's possible to get unexpected state transitions. Happy to do something different if I'm not understanding.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, this sounds plausible to me: newStream is non-blocking (like most things in Envoy), so if multiple streams are requested before the connection is established, you'd see multiple newStream calls come in before the connection is established.

No change necessary, I was just curious how this all fit together :)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Optional (I mean in this PR, not optional overall) I wonder if we should start landing docs on this as we start implementing the "real" logic.

Generally we land docs when we unhide config (#15926) but the failover logic is sufficiently complicated I think we could land docs for now in source/docs and then move them to docs/ when the PR lands. Your call if we do them now or in a future iteration :-)

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Docs definitely make sense and I'm happy to work on them. I think I'll do that in a follow-up since this PR is (hopefully) basically done at this point.

if (!expiration_timer_->enabled()) {
expiration_timer_->enableTimer(std::chrono::duration_cast<std::chrono::milliseconds>(
DefaultExpirationTime * (1 << consecutive_broken_count_)));
if (consecutive_broken_count_ < MaxConsecutiveBrokenCount) {
++consecutive_broken_count_;
}
}
}

// Records that HTTP/3 is working: cancels any pending broken-status expiration, clears the
// consecutive-broken counter used for backoff, and moves to the Confirmed state.
void Http3StatusTracker::markHttp3Confirmed() {
  // Only touch the timer if it is currently armed.
  if (expiration_timer_->enabled()) {
    expiration_timer_->disableTimer();
  }
  consecutive_broken_count_ = 0;
  state_ = State::Confirmed;
}

// Timer callback: once the broken period has elapsed, HTTP/3 returns to the Pending state.
// If the tracker is no longer Broken when the timer fires, the state is left unchanged.
void Http3StatusTracker::onExpirationTimeout() {
  if (state_ == State::Broken) {
    state_ = State::Pending;
  }
}

} // namespace Http
} // namespace Envoy
42 changes: 42 additions & 0 deletions source/common/http/http3_status_tracker.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#pragma once

#include "envoy/event/dispatcher.h"
#include "envoy/event/timer.h"

namespace Envoy {
namespace Http {

// Records whether HTTP/3 is pending, broken, or confirmed to be working. A broken status
// lasts for a limited period of time, and that period is subject to exponential backoff
// when HTTP/3 is marked broken repeatedly.
class Http3StatusTracker {
public:
  explicit Http3StatusTracker(Event::Dispatcher& dispatcher);

  // Puts HTTP/3 into the broken state for a period of time, subject to backoff.
  void markHttp3Broken();
  // Puts HTTP/3 into the confirmed (working) state and resets the backoff timeout.
  void markHttp3Confirmed();

  // True while HTTP/3 is marked broken.
  bool isHttp3Broken() const;
  // True while HTTP/3 is confirmed to be working.
  bool isHttp3Confirmed() const;

private:
  enum class State {
    // Initial state, and the state entered again when a broken period expires.
    Pending,
    Broken,
    Confirmed,
  };

  // Callback for the expiration timer.
  void onExpirationTimeout();

  State state_{State::Pending};
  // How many times in a row HTTP/3 has been marked broken.
  int consecutive_broken_count_{0};
  // Timer that tracks when the HTTP/3 broken status should expire.
  Event::TimerPtr expiration_timer_;
};

} // namespace Http
} // namespace Envoy
10 changes: 10 additions & 0 deletions test/common/http/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -440,6 +440,16 @@ envoy_cc_test(
]),
)

# Test target built from http3_status_tracker_test.cc; it depends on the
# //source/common/http:http3_status_tracker library.
envoy_cc_test(
name = "http3_status_tracker_test",
srcs = ["http3_status_tracker_test.cc"],
deps = [
":common_lib",
"//source/common/http:http3_status_tracker",
"//test/mocks:common_lib",
],
)

envoy_cc_test(
name = "alternate_protocols_test",
srcs = ["alternate_protocols_test.cc"],
Expand Down
2 changes: 1 addition & 1 deletion test/common/http/conn_pool_grid_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -424,7 +424,7 @@ TEST_F(ConnectivityGridTest, NoDrainOnTeardown) {

// Test that when HTTP/3 is broken then the HTTP/3 pool is skipped.
TEST_F(ConnectivityGridTest, SuccessAfterBroken) {
grid_.setIsHttp3Broken(true);
grid_.markHttp3Broken();
EXPECT_EQ(grid_.first(), nullptr);

EXPECT_LOG_CONTAINS("trace", "HTTP/3 is broken to host 'first', skipping.",
Expand Down
Loading