Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -193,7 +193,10 @@ private static void ShareHttpHandlers(
{
PooledConnectionLifetime = TimeSpan.FromMinutes(10), // Customize this value based on desired DNS refresh timer
MaxConnectionsPerServer = 20, // Customize the maximum number of allowed connections
EnableMultipleHttp2Connections = true // Recommended for thin client (HTTP/2) mode to open additional connections when stream limits are reached
EnableMultipleHttp2Connections = true, // Recommended for thin client (HTTP/2) mode to open additional connections when stream limits are reached
KeepAlivePingDelay = TimeSpan.FromSeconds(1), // Send HTTP/2 PING after 1s of inactivity to detect broken connections
KeepAlivePingTimeout = TimeSpan.FromSeconds(2), // Mark connection dead if no PONG within 2s
KeepAlivePingPolicy = HttpKeepAlivePingPolicy.Always
};

CosmosClientOptions cosmosClientOptions = new CosmosClientOptions()
Expand Down
23 changes: 17 additions & 6 deletions Microsoft.Azure.Cosmos/src/CosmosClientOptions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -244,11 +244,19 @@ public string ApplicationName
/// </summary>
/// <remarks>
/// This setting is only applicable in Gateway mode.
/// The SDK sets EnableMultipleHttp2Connections = true on the underlying SocketsHttpHandler,
/// allowing additional HTTP/2 TCP connections to be opened when the maximum concurrent streams
/// limit on an existing connection is reached. This property controls the upper bound on the
/// total number of connections per server endpoint.
/// When using a custom <see cref="HttpClientFactory"/>, set EnableMultipleHttp2Connections
/// The SDK sets the following on the underlying SocketsHttpHandler:
/// <list type="bullet">
/// <item><description>EnableMultipleHttp2Connections = true — allows additional HTTP/2 TCP connections
/// to be opened when the maximum concurrent streams limit on an existing connection is reached.</description></item>
/// <item><description>KeepAlivePingDelay = 1 second — sends HTTP/2 PING frames after 1 second
/// of inactivity to detect broken connections in the pool.</description></item>
/// <item><description>KeepAlivePingTimeout = 2 seconds — marks a connection as dead if no PONG
/// response is received within 2 seconds.</description></item>
/// <item><description>KeepAlivePingPolicy = Always — sends pings even for idle connections, which
/// is critical for detecting broken connections that remain in the pool.</description></item>
/// </list>
/// This property controls the upper bound on the total number of connections per server endpoint.
/// When using a custom <see cref="HttpClientFactory"/>, configure these properties
/// directly on your SocketsHttpHandler for equivalent behavior.
/// </remarks>
/// <example>
Expand All @@ -268,7 +276,10 @@ public string ApplicationName
/// SocketsHttpHandler handler = new SocketsHttpHandler
/// {
/// MaxConnectionsPerServer = 100,
/// EnableMultipleHttp2Connections = true
/// EnableMultipleHttp2Connections = true,
/// KeepAlivePingDelay = TimeSpan.FromSeconds(1),
/// KeepAlivePingTimeout = TimeSpan.FromSeconds(2),
/// KeepAlivePingPolicy = HttpKeepAlivePingPolicy.Always
/// };
/// CosmosClientOptions options = new CosmosClientOptions()
/// {
Expand Down
36 changes: 36 additions & 0 deletions Microsoft.Azure.Cosmos/src/HttpClient/CosmosHttpClientCore.cs
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,42 @@ public static HttpMessageHandler CreateSocketsHttpHandlerHelper(
DefaultTrace.TraceWarning("Failed to set EnableMultipleHttp2Connections on SocketsHttpHandler: {0}", ex.Message);
}

// Enable HTTP/2 PING keep-alive to detect broken connections.
// Without this, a broken HTTP/2 connection (e.g. after a network blip or load balancer
// reset) can remain in the pool indefinitely, causing persistent request failures
// that only resolve after application restart.
// KeepAlivePingDelay/Timeout/Policy are available on SocketsHttpHandler in .NET 5.0+.
try
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Recommendation — Resilience: Separate try-catch blocks for independent property sets

The single try/catch block (lines 191–220) groups env var parsing, KeepAlivePingDelay, KeepAlivePingTimeout, and KeepAlivePingPolicy together. If any one operation fails — including a FormatException from a non-numeric env var string, or a TargetInvocationException from the delay setter — the entire block is skipped, including the critical KeepAlivePingPolicy = Always.

The sibling EnableMultipleHttp2Connections just above (lines 176–184) uses its own isolated try-catch, which is the established pattern in this method. Each logical property set is independent and should fail independently.

Concrete failure scenario: A user sets AZURE_COSMOS_HTTP2_KEEPALIVE_PING_DELAY_IN_SECONDS=fast (non-numeric). Convert.ChangeType throws before any property is set → the catch fires → delay, timeout, AND policy are all skipped → the handler has .NET defaults (KeepAlivePingDelay = Timeout.InfiniteTimeSpan, KeepAlivePingPolicy = WithActiveRequests) → PING keep-alive is completely disabled. A bad delay value shouldn't prevent the timeout and policy from being configured with their defaults.

Suggested structure:

// Block 1: Parse env vars (with defaults)
int pingDelayInSeconds = /* ... defaultValue: 1 */;
int pingTimeoutInSeconds = /* ... defaultValue: 2 */;

// Block 2: Set delay (independent)
try { keepAlivePingDelayInfo?.SetValue(...); }
catch (Exception ex) { DefaultTrace.TraceWarning(...); }

// Block 3: Set timeout (independent)
try { keepAlivePingTimeoutInfo?.SetValue(...); }
catch (Exception ex) { DefaultTrace.TraceWarning(...); }

// Block 4: Set policy (independent)
try { /* Enum.ToObject + SetValue */ }
catch (Exception ex) { DefaultTrace.TraceWarning(...); }

This way, a failure in one property still allows the others to be applied — all partial states are safe since the .NET defaults are reasonable fallbacks.

⚠️ AI-generated review — may be incorrect. Agree? → resolve the conversation. Disagree? → reply with your reasoning.

{
int pingDelayInSeconds = ConfigurationManager.GetEnvironmentVariable<int>(
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟡 Recommendation — Resilience: Add Math.Max clamping on env var overrides to prevent silent feature disablement

int pingDelayInSeconds = ConfigurationManager.GetEnvironmentVariable<int>(
    ConfigurationManager.Http2KeepAlivePingDelayInSeconds,
    defaultValue: 1);

int pingTimeoutInSeconds = ConfigurationManager.GetEnvironmentVariable<int>(
    ConfigurationManager.Http2KeepAlivePingTimeoutInSeconds,
    defaultValue: 2);

Both KeepAlivePingDelay and KeepAlivePingTimeout on SocketsHttpHandler require values strictly greater than TimeSpan.Zero (the setter throws ArgumentOutOfRangeException for <= TimeSpan.Zero). If a user sets AZURE_COSMOS_HTTP2_KEEPALIVE_PING_DELAY_IN_SECONDS=0 or a negative value:

  1. GetEnvironmentVariable<int> parses successfully (valid int)
  2. TimeSpan.FromSeconds(0) → TimeSpan.Zero
  3. SetValue throws ArgumentOutOfRangeException
  4. Caught by catch (Exception ex) → all three settings skipped (delay, timeout, AND policy)
  5. Handler has .NET defaults: KeepAlivePingDelay = Timeout.InfiniteTimeSpan → pings disabled entirely

The feature this PR adds is silently defeated with only a trace warning. This is especially subtle because the delay failure also prevents the KeepAlivePingPolicy = Always from being set — so even if only the timeout env var is invalid, the policy reverts to WithActiveRequests (which doesn't ping idle connections, defeating the purpose).

The codebase already has an established pattern for this in ConfigurationManager.cs:159-166:

// Existing pattern
return Math.Max(
    ConfigurationManager.GetEnvironmentVariable(..., defaultValue: ...),
    MinMaxRetriesInLocalRegionWhenRemoteRegionPreferred);

Suggested fix:

int pingDelayInSeconds = Math.Max(1, ConfigurationManager.GetEnvironmentVariable<int>(
    ConfigurationManager.Http2KeepAlivePingDelayInSeconds,
    defaultValue: 1));

int pingTimeoutInSeconds = Math.Max(1, ConfigurationManager.GetEnvironmentVariable<int>(
    ConfigurationManager.Http2KeepAlivePingTimeoutInSeconds,
    defaultValue: 2));

This guarantees values are always ≥ 1 second, which is the minimum SocketsHttpHandler accepts, and follows the existing clamping convention.

⚠️ AI-generated review — may be incorrect. Agree? → resolve the conversation. Disagree? → reply with your reasoning.

ConfigurationManager.Http2KeepAlivePingDelayInSeconds,
defaultValue: 1);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The defaults seem very aggressive.
Let's align with the eventual Rust expected values.

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Seems like these are the Rust defaults.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comment thread
kirankumarkolli marked this conversation as resolved.

int pingTimeoutInSeconds = ConfigurationManager.GetEnvironmentVariable<int>(
ConfigurationManager.Http2KeepAlivePingTimeoutInSeconds,
defaultValue: 2);

PropertyInfo keepAlivePingDelayInfo = socketHandlerType.GetProperty("KeepAlivePingDelay");
keepAlivePingDelayInfo?.SetValue(socketHttpHandler, TimeSpan.FromSeconds(pingDelayInSeconds));

PropertyInfo keepAlivePingTimeoutInfo = socketHandlerType.GetProperty("KeepAlivePingTimeout");
keepAlivePingTimeoutInfo?.SetValue(socketHttpHandler, TimeSpan.FromSeconds(pingTimeoutInSeconds));

// HttpKeepAlivePingPolicy.Always = 1: send pings even for idle connections,
// which is critical for detecting broken connections lingering in the pool.
PropertyInfo keepAlivePingPolicyInfo = socketHandlerType.GetProperty("KeepAlivePingPolicy");
if (keepAlivePingPolicyInfo != null)
{
Type pingPolicyType = keepAlivePingPolicyInfo.PropertyType;
object alwaysValue = Enum.ToObject(pingPolicyType, 1);
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🟢 Suggestion — Resilience: Prefer Enum.Parse by name over Enum.ToObject by ordinal

object alwaysValue = Enum.ToObject(pingPolicyType, 1);

The hardcoded 1 is correct today (HttpKeepAlivePingPolicy.Always = 1), but it's an ordinal assumption with no compile-time validation. If Microsoft ever inserted a new enum member before Always, this would silently apply the wrong policy.

Consider using name-based lookup instead:

object alwaysValue = Enum.Parse(pingPolicyType, "Always");

This is self-documenting, resilient to ordinal changes, and equally correct. The performance difference is irrelevant — this runs once at client startup. Enum.Parse will throw ArgumentException if the name changes (surfacing the problem immediately), whereas Enum.ToObject would silently apply the wrong policy.


⚠️ AI-generated review — may be incorrect. Agree? → resolve the conversation. Disagree? → reply with your reasoning.

keepAlivePingPolicyInfo.SetValue(socketHttpHandler, alwaysValue);
}
}
catch (Exception ex)
{
DefaultTrace.TraceWarning("Failed to configure HTTP/2 keep-alive ping on SocketsHttpHandler: {0}", ex.Message);
}

if (serverCertificateCustomValidationCallback != null)
{
//Get SslOptions Property
Expand Down
13 changes: 13 additions & 0 deletions Microsoft.Azure.Cosmos/src/Util/ConfigurationManager.cs
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,19 @@ internal static class ConfigurationManager
/// </summary>
internal static readonly string TcpDnsDotSuffixEnabled = "AZURE_COSMOS_TCP_DNS_DOT_SUFFIX_ENABLED";

/// <summary>
/// Name of the environment variable that overrides the HTTP/2 PING keep-alive delay (in seconds).
/// After this many seconds of inactivity on an HTTP/2 connection, a PING frame is sent
/// to detect broken connections in the pool. Default: 1 second.
/// </summary>
/// <remarks>
/// The value is read as an integer number of seconds, converted with TimeSpan.FromSeconds,
/// and assigned to the KeepAlivePingDelay property of the SocketsHttpHandler.
/// </remarks>
internal static readonly string Http2KeepAlivePingDelayInSeconds = "AZURE_COSMOS_HTTP2_KEEPALIVE_PING_DELAY_IN_SECONDS";

/// <summary>
/// Name of the environment variable that overrides the HTTP/2 PING keep-alive timeout (in seconds).
/// If no PONG response is received within this time, the connection is marked dead. Default: 2 seconds.
/// </summary>
/// <remarks>
/// The value is read as an integer number of seconds, converted with TimeSpan.FromSeconds,
/// and assigned to the KeepAlivePingTimeout property of the SocketsHttpHandler.
/// </remarks>
internal static readonly string Http2KeepAlivePingTimeoutInSeconds = "AZURE_COSMOS_HTTP2_KEEPALIVE_PING_TIMEOUT_IN_SECONDS";

public static T GetEnvironmentVariable<T>(string variable, T defaultValue)
{
string value = Environment.GetEnvironmentVariable(variable);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -829,6 +829,10 @@ public void VerifyHttpClientHandlerIsSet()
SocketsHttpHandler handler = (SocketsHttpHandler)cosmosHttpClient.HttpMessageHandler;

Assert.IsTrue(object.ReferenceEquals(webProxy, handler.Proxy));
Assert.IsTrue(handler.EnableMultipleHttp2Connections, "EnableMultipleHttp2Connections should be set through the builder pipeline");
Assert.AreEqual(TimeSpan.FromSeconds(1), handler.KeepAlivePingDelay, "KeepAlivePingDelay should be set through the builder pipeline");
Assert.AreEqual(TimeSpan.FromSeconds(2), handler.KeepAlivePingTimeout, "KeepAlivePingTimeout should be set through the builder pipeline");
Assert.AreEqual(HttpKeepAlivePingPolicy.Always, handler.KeepAlivePingPolicy, "KeepAlivePingPolicy should be set through the builder pipeline");
}

[TestMethod]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -466,6 +466,11 @@ public void CreateSocketsHttpHandlerCreatesCorrectValueType()
Assert.AreEqual(gatewayLimit, socketsHandler.MaxConnectionsPerServer);
Assert.IsTrue(socketsHandler.EnableMultipleHttp2Connections, "EnableMultipleHttp2Connections should be true for HTTP/2 thin client support");

// HTTP/2 PING keep-alive: detects broken connections lingering in the pool
Assert.AreEqual(TimeSpan.FromSeconds(1), socketsHandler.KeepAlivePingDelay, "KeepAlivePingDelay should be 1 second for HTTP/2 connection health monitoring");
Assert.AreEqual(TimeSpan.FromSeconds(2), socketsHandler.KeepAlivePingTimeout, "KeepAlivePingTimeout should be 2 seconds");
Assert.AreEqual(HttpKeepAlivePingPolicy.Always, socketsHandler.KeepAlivePingPolicy, "KeepAlivePingPolicy should be Always to detect broken idle connections");

//Create cert for test
X509Certificate2 x509Certificate2 = new CertificateRequest("cn=www.test", ECDsa.Create(), HashAlgorithmName.SHA256).CreateSelfSigned(DateTime.Now, DateTime.Now.AddYears(1));
X509Chain x509Chain = new X509Chain();
Expand Down Expand Up @@ -495,6 +500,46 @@ public void CreateHttpClientHandlerCreatesCorrectValueType()
Assert.IsFalse(clientHandler.ServerCertificateCustomValidationCallback.Invoke(new HttpRequestMessage(), x509Certificate2, x509Chain, sslPolicyErrors));
}

[TestMethod]
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Possible concurrency issue with other tests?

// Verifies that the HTTP/2 keep-alive ping delay and timeout applied to the SocketsHttpHandler
// can be overridden through environment variables, and that the ping policy remains Always
// regardless of those overrides.
public void CreateSocketsHttpHandlerRespectsEnvironmentVariableOverrides()
{
    int customPingDelay = 60;
    int customPingTimeout = 10;

    // Environment variables are process-wide state. Remember whatever was configured before
    // this test ran so the finally block can put it back instead of unconditionally clearing it
    // (which would destroy a value set by the host or by the surrounding test run).
    string originalPingDelay = Environment.GetEnvironmentVariable(
        ConfigurationManager.Http2KeepAlivePingDelayInSeconds);
    string originalPingTimeout = Environment.GetEnvironmentVariable(
        ConfigurationManager.Http2KeepAlivePingTimeoutInSeconds);

    try
    {
        Environment.SetEnvironmentVariable(
            ConfigurationManager.Http2KeepAlivePingDelayInSeconds,
            customPingDelay.ToString());
        Environment.SetEnvironmentVariable(
            ConfigurationManager.Http2KeepAlivePingTimeoutInSeconds,
            customPingTimeout.ToString());

        HttpMessageHandler handler = CosmosHttpClientCore.CreateSocketsHttpHandlerHelper(
            gatewayModeMaxConnectionLimit: 10,
            webProxy: null,
            serverCertificateCustomValidationCallback: null);

        SocketsHttpHandler socketsHandler = (SocketsHttpHandler)handler;

        Assert.AreEqual(TimeSpan.FromSeconds(customPingDelay), socketsHandler.KeepAlivePingDelay,
            "KeepAlivePingDelay should respect environment variable override");
        Assert.AreEqual(TimeSpan.FromSeconds(customPingTimeout), socketsHandler.KeepAlivePingTimeout,
            "KeepAlivePingTimeout should respect environment variable override");
        Assert.AreEqual(HttpKeepAlivePingPolicy.Always, socketsHandler.KeepAlivePingPolicy,
            "KeepAlivePingPolicy should always be Always regardless of environment variables");
    }
    finally
    {
        // Passing null removes the variable, so restoring a previously-unset variable
        // behaves exactly like the old unconditional cleanup.
        Environment.SetEnvironmentVariable(
            ConfigurationManager.Http2KeepAlivePingDelayInSeconds,
            originalPingDelay);
        Environment.SetEnvironmentVariable(
            ConfigurationManager.Http2KeepAlivePingTimeoutInSeconds,
            originalPingTimeout);
    }
}

[TestMethod]
public async Task HttpTimeoutPolicyForThinClientOn503TestAsync()
{
Expand Down
Loading