Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

misc: tweak metrics to support service-level benchmarks #908

Merged
merged 3 commits into from
Aug 9, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions .changes/73448090-7b5b-4c5a-875a-730efacacf9d.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
{
"id": "73448090-7b5b-4c5a-875a-730efacacf9d",
"type": "misc",
"description": "Tweak metrics to better support service-level benchmarks",
"issues": [
"awslabs/aws-sdk-kotlin#968"
]
}
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@ public typealias DoubleHistogram = Histogram<Double>
*/
@InternalApi
public fun DoubleHistogram.recordSeconds(value: Duration, attributes: Attributes = emptyAttributes(), context: Context? = null) {
val ms = value.inWholeMilliseconds.toDouble()
val sec = ms / 1000
val ms = value.inWholeNanoseconds.toDouble()
val sec = ms / 1_000_000_000
record(sec, attributes, context)
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
package aws.smithy.kotlin.runtime.http.engine.okhttp

import aws.smithy.kotlin.runtime.ExperimentalApi
import aws.smithy.kotlin.runtime.http.engine.EngineAttributes
import aws.smithy.kotlin.runtime.http.engine.internal.HttpClientMetrics
import aws.smithy.kotlin.runtime.net.HostResolver
import aws.smithy.kotlin.runtime.net.toHostAddress
Expand Down Expand Up @@ -54,6 +55,8 @@ internal class HttpEngineEventListener(
private var signaledConnectAcquireDuration = false
private var dnsStartTime: TimeMark? = null

private var requestTimeEnd: TimeMark? = null

private inline fun trace(crossinline msg: MessageSupplier) {
logger.trace { msg() }
}
Expand Down Expand Up @@ -158,14 +161,22 @@ internal class HttpEngineEventListener(

override fun requestBodyStart(call: Call) = trace { "sending request body" }

override fun requestBodyEnd(call: Call, byteCount: Long) =
override fun requestBodyEnd(call: Call, byteCount: Long) {
requestTimeEnd = TimeSource.Monotonic.markNow()
trace { "finished sending request body: bytesSent=$byteCount" }
}

override fun requestFailed(call: Call, ioe: IOException) = trace(ioe) { "request failed" }

override fun requestHeadersStart(call: Call) = trace { "sending request headers" }

override fun requestHeadersEnd(call: Call, request: Request) = trace { "finished sending request headers" }
override fun requestHeadersEnd(call: Call, request: Request) {
if (request.body == null) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

correctness: Shouldn't this be after sending the body? If there is no body this is fine, we should be able to detect that though and either set it here or in responseBodyEnd.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm a little confused about this comment, requestTimeEnd is set in both places: requestHeadersEnd if there is no body, and requestBodyEnd if there is a body, so I think it's correct?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you're right, I just missed it in first review

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes what confused me is that this entire file has the events out-of-order (i.e requestHeadersEnd is defined after requestBodyStart even though it occurs before in the chain of events). It could be worth reordering these to match the order shown in the OkHttp docs.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, that would make it easier to parse this file. I can submit a mini-PR after this one gets merged.

requestTimeEnd = TimeSource.Monotonic.markNow()
}

trace { "finished sending request headers" }
}

override fun responseBodyStart(call: Call) = trace { "response body available" }

Expand All @@ -174,7 +185,13 @@ internal class HttpEngineEventListener(

override fun responseFailed(call: Call, ioe: IOException) = trace(ioe) { "response failed" }

override fun responseHeadersStart(call: Call) = trace { "response headers start" }
override fun responseHeadersStart(call: Call) {
requestTimeEnd?.elapsedNow()?.let { ttfb ->
metrics.timeToFirstByteDuration.recordSeconds(ttfb)
call.request().tag<SdkRequestTag>()?.execContext?.set(EngineAttributes.TimeToFirstByte, ttfb)
}
trace { "response headers start" }
}

override fun responseHeadersEnd(call: Call, response: Response) {
val contentLength = response.body.contentLength()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
/*
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0
*/

package aws.smithy.kotlin.runtime.http.engine

import aws.smithy.kotlin.runtime.InternalApi
import aws.smithy.kotlin.runtime.util.AttributeKey
import kotlin.time.Duration

@InternalApi
public object EngineAttributes {
public val TimeToFirstByte: AttributeKey<Duration> = AttributeKey("TimeToFirstByte")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fix: docs

}
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,12 @@ public class HttpClientMetrics(
"The total number of bytes received by the HTTP client",
)

public val timeToFirstByteDuration: DoubleHistogram = meter.createDoubleHistogram(
"smithy.client.http.time_to_first_byte",
"s",
"The amount of time after a request has been sent spent waiting on a response from the remote server",
)

/**
* The maximum number of connections configured for the client
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@ import aws.smithy.kotlin.runtime.client.ProtocolRequestInterceptorContext
import aws.smithy.kotlin.runtime.client.ProtocolResponseInterceptorContext
import aws.smithy.kotlin.runtime.client.RequestInterceptorContext
import aws.smithy.kotlin.runtime.client.ResponseInterceptorContext
import aws.smithy.kotlin.runtime.http.engine.EngineAttributes
import aws.smithy.kotlin.runtime.http.operation.OperationMetrics
import aws.smithy.kotlin.runtime.http.request.HttpRequest
import aws.smithy.kotlin.runtime.http.response.HttpResponse
import aws.smithy.kotlin.runtime.telemetry.metrics.recordSeconds
import aws.smithy.kotlin.runtime.util.attributesOf
import aws.smithy.kotlin.runtime.util.get
import aws.smithy.kotlin.runtime.util.merge
import aws.smithy.kotlin.runtime.util.mutableAttributesOf
import kotlin.time.ExperimentalTime
Expand All @@ -39,7 +41,7 @@ internal class OperationTelemetryInterceptor(
private var callStart: TimeMark? = null
private var serializeStart: TimeMark? = null
private var deserializeStart: TimeMark? = null
private var transmitStart: TimeMark? = null
private var attemptStart: TimeMark? = null
private var attempts = 0

private val perRpcAttributes = attributesOf {
Expand Down Expand Up @@ -94,6 +96,13 @@ internal class OperationTelemetryInterceptor(
if (attempts > 1) {
metrics.rpcRetryCount.add(1L, perRpcAttributes, metrics.provider.contextManager.current())
}

val attemptDuration = attemptStart?.elapsedNow() ?: return
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Change the implementation of smithy.client.attempt_duration to measure attempt duration instead of merely transmission duration

This now includes signing and other things right? The current description of the metric was accurate for what it measured: The time it takes to connect to the service, send the request, and receive the HTTP status code and headers from the response for an operation. If we are updating the metric definition lets update the description to match.

Can you first explain why this is needed/better? Just want to understand the motivation for the change.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this now includes identity resolution, endpoint resolution, and signing. The inclusion of EP resolution and signing will likely have minimal impact to the metric but identity resolution is a potentially-expensive operation since it may involve local IO or calls to remote services. Given that the intent is to measure the SDK-added overhead and that identity resolution, EP resolution, and signing are per-attempt, we'd need to know the duration of the entire attempt.

I'll update the metric documentation.

metrics.rpcAttemptDuration.recordSeconds(attemptDuration, perRpcAttributes, metrics.provider.contextManager.current())

context.executionContext.getOrNull(EngineAttributes.TimeToFirstByte)?.let { ttfb ->
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: Should we clear this after reading it? e.g. what if we retry and it fails before recording TTFB? We'll be using the first TTFB value.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right we may have an issue if we fail to record TTFB on a retry. We certainly can clear it after reading it, although this makes me wonder if there's a new set of execution context which is scoped to a single attempt and should be cleared when an attempt is ended.

What would we think about a new optional scope for attributes:

interface MutableAttributes : Attributes {
    // ...existing set, remove, and computeIfAbsent methods...

    operator fun <T : Any> set(scope: AttributeScope, key: AttributeKey<T>, value: T)
    fun <T : Any> removeAll(scope: AttributeScope)
}

Get operations would find an attribute regardless of its scope but removeAll could be used to clear all attributes with a specific scope (e.g., an attempt).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm maybe. Seems like it may complicate some things but wouldn't know until I tried. Also raises questions like what attributes are visible? If I set an attribute in one scope but not another are they visible to other scopes? Are they independent (i.e. can they shadow each other)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right, this is likely too complicated without a deeper design. I'll just remove it upon reading.

metrics.rpcAttemptOverheadDuration.recordSeconds(attemptDuration - ttfb, perRpcAttributes)
}
}

override fun readBeforeDeserialization(context: ProtocolResponseInterceptorContext<Any, HttpRequest, HttpResponse>) {
Expand All @@ -105,12 +114,7 @@ internal class OperationTelemetryInterceptor(
metrics.deserializationDuration.recordSeconds(deserializeDuration, perRpcAttributes, metrics.provider.contextManager.current())
}

override fun readBeforeTransmit(context: ProtocolRequestInterceptorContext<Any, HttpRequest>) {
transmitStart = timeSource.markNow()
}

override fun readAfterTransmit(context: ProtocolResponseInterceptorContext<Any, HttpRequest, HttpResponse>) {
val serviceCallDuration = transmitStart?.elapsedNow() ?: return
metrics.rpcAttemptDuration.recordSeconds(serviceCallDuration, perRpcAttributes, metrics.provider.contextManager.current())
override fun readBeforeAttempt(context: ProtocolRequestInterceptorContext<Any, HttpRequest>) {
attemptStart = timeSource.markNow()
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ public class OperationMetrics(
public val rpcRequestSize: LongHistogram = meter.createLongHistogram("smithy.client.request.size", "By", "Size of the serialized request message")
public val rpcResponseSize: LongHistogram = meter.createLongHistogram("smithy.client.response.size", "By", "Size of the serialized response message")
public val rpcAttemptDuration: DoubleHistogram = meter.createDoubleHistogram("smithy.client.attempt_duration", "s", "The time it takes to connect to the service, send the request, and receive the HTTP status code and headers from the response for an operation")
public val rpcAttemptOverheadDuration: DoubleHistogram = meter.createDoubleHistogram("smithy.client.attempt_overhead_duration", "s", "The time it takes to execute an attempt minus the time spent waiting for a response from the remote server")
public val serializationDuration: DoubleHistogram = meter.createDoubleHistogram("smithy.client.serialization_duration", "s", "The time it takes to serialize a request message body")
public val deserializationDuration: DoubleHistogram = meter.createDoubleHistogram("smithy.client.deserialization_duration", "s", "The time it takes to deserialize a response message body")
public val resolveEndpointDuration: DoubleHistogram = meter.createDoubleHistogram("smithy.client.resolve_endpoint_duration", "s", "The time it takes to resolve an endpoint for a request")
Expand Down