Skip to content

Commit

Permalink
Fix instance where context cancellation error was cached (#10537)
Browse files Browse the repository at this point in the history
* Fix instance where context cancellation error was cached

This change fixes the error mapping done in the query sharding
middleware to correctly map all possible errors from the PromQL
engine to an appropriate APIError type. This fixes an issue where
`context.Canceled` was treated as an execution error and cached
for subsequent requests.

As part of this, instrumentation for the error caching middleware
was improved and Jaeger tracing in various Docker compose setups
was fixed by pinning the image used.

Signed-off-by: Nick Pillitteri <[email protected]>

* Changelog

Signed-off-by: Nick Pillitteri <[email protected]>

---------

Signed-off-by: Nick Pillitteri <[email protected]>
  • Loading branch information
56quarters authored Jan 30, 2025
1 parent 0b61a68 commit ce9463e
Show file tree
Hide file tree
Showing 10 changed files with 105 additions and 9 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
* [BUGFIX] Distributor: Use a boolean to track changes while merging the ReplicaDesc components, rather than comparing the objects directly. #10185
* [BUGFIX] Querier: fix timeout responding to query-frontend when response size is very close to `-querier.frontend-client.grpc-max-send-msg-size`. #10154
* [BUGFIX] Query-frontend and querier: show warning/info annotations in some cases where they were missing (if a lazy querier was used). #10277
* [BUGFIX] Query-frontend: Fix an issue where transient errors could be inadvertently cached. #10537
* [BUGFIX] Ruler: fix indeterminate rules being always run concurrently (instead of never) when `-ruler.max-independent-rule-evaluation-concurrency` is set. https://github.com/prometheus/prometheus/pull/15560 #10258
* [BUGFIX] PromQL: Fix various UTF-8 bugs related to quoting. https://github.com/prometheus/prometheus/pull/15531 #10258
* [BUGFIX] Ruler: Fixed an issue when using the experimental `-ruler.max-independent-rule-evaluation-concurrency` feature, where if a rule group was eligible for concurrency, it would flap between running concurrently or not based on the time it took after running concurrently. #9726 #10189
Expand Down
3 changes: 2 additions & 1 deletion development/mimir-ingest-storage/docker-compose.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,8 @@ std.manifestYamlDoc({

jaeger:: {
jaeger: {
image: 'jaegertracing/all-in-one',
// Use 1.62 specifically since 1.63 removes the agent which we depend on for now.
image: 'jaegertracing/all-in-one:1.62.0',
ports: ['16686:16686', '14268'],
},
},
Expand Down
2 changes: 1 addition & 1 deletion development/mimir-ingest-storage/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
"volumes":
- "./config:/etc/agent-config"
"jaeger":
"image": "jaegertracing/all-in-one"
"image": "jaegertracing/all-in-one:1.62.0"
"ports":
- "16686:16686"
- "14268"
Expand Down
3 changes: 2 additions & 1 deletion development/mimir-microservices-mode/docker-compose.jsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,8 @@ std.manifestYamlDoc({

jaeger:: {
jaeger: {
image: 'jaegertracing/all-in-one',
// Use 1.62 specifically since 1.63 removes the agent which we depend on for now.
image: 'jaegertracing/all-in-one:1.62.0',
ports: ['16686:16686', '14268'],
},
},
Expand Down
2 changes: 1 addition & 1 deletion development/mimir-microservices-mode/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@
- "./activity:/activity"
- ".data-ingester-3:/tmp/mimir-tsdb-ingester:delegated"
"jaeger":
"image": "jaegertracing/all-in-one"
"image": "jaegertracing/all-in-one:1.62.0"
"ports":
- "16686:16686"
- "14268"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,8 @@ services:
- 9091:9091

jaeger:
image: jaegertracing/all-in-one
# Use 1.62 specifically since 1.63 removes the agent which we depend on for now.
image: jaegertracing/all-in-one:1.62.0
ports:
- 16686:16686
- "14268"
Expand Down
3 changes: 2 additions & 1 deletion development/mimir-monolithic-mode/docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,8 @@ services:
- 9092:9092

jaeger:
image: jaegertracing/all-in-one
# Use 1.62 specifically since 1.63 removes the agent which we depend on for now.
image: jaegertracing/all-in-one:1.62.0
ports:
- 16681:16686
- "14268"
Expand Down
29 changes: 26 additions & 3 deletions pkg/frontend/querymiddleware/error_caching.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ import (
"github.com/gogo/protobuf/proto"
"github.com/grafana/dskit/cache"
"github.com/grafana/dskit/tenant"
"github.com/grafana/dskit/tracing"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"

Expand Down Expand Up @@ -85,7 +86,7 @@ func (e *errorCachingHandler) Do(ctx context.Context, request MetricsQueryReques
return e.next.Do(ctx, request)
}

e.cacheLoadAttempted.Inc()
addWithExemplar(ctx, e.cacheLoadAttempted, 1)
key := e.keyGen.QueryRequestError(ctx, tenant.JoinTenantIDs(tenantIDs), request)
hashedKey := cacheHashKey(key)

Expand All @@ -103,7 +104,7 @@ func (e *errorCachingHandler) Do(ctx context.Context, request MetricsQueryReques

res, err := e.next.Do(ctx, request)
if err != nil {
e.cacheStoreAttempted.Inc()
addWithExemplar(ctx, e.cacheStoreAttempted, 1)

var apiErr *apierror.APIError
if !errors.As(err, &apiErr) {
Expand All @@ -123,6 +124,14 @@ func (e *errorCachingHandler) Do(ctx context.Context, request MetricsQueryReques

ttl := validation.MinDurationPerTenant(tenantIDs, e.limits.ResultsCacheTTLForErrors)
e.storeErrorToCache(key, hashedKey, ttl, apiErr, spanLog)

spanLog.DebugLog(
"msg", "stored API error to cache",
"key", key,
"hashed_key", hashedKey,
"error_type", apiErr.Type,
"error_message", apiErr.Message,
)
}

return res, err
Expand Down Expand Up @@ -163,7 +172,14 @@ func (e *errorCachingHandler) storeErrorToCache(key, hashedKey string, ttl time.
})

if err != nil {
level.Warn(spanLog).Log("msg", "unable to marshal cached error", "err", err)
level.Warn(spanLog).Log(
"msg", "unable to marshal cached error",
"key", key,
"hashed_key", hashedKey,
"error_type", apiErr.Type,
"error_message", apiErr.Message,
"err", err,
)
return
}

Expand All @@ -177,3 +193,10 @@ func (e *errorCachingHandler) isCacheable(apiErr *apierror.APIError) (bool, stri

return true, ""
}

func addWithExemplar(ctx context.Context, counter prometheus.Counter, val float64) {
// TODO: Add this to dskit as instrument.AddWithExemplar just like instrument.ObserveWithExemplar
if traceID, traceOK := tracing.ExtractSampledTraceID(ctx); traceOK {
counter.(prometheus.ExemplarAdder).AddWithExemplar(val, prometheus.Labels{"trace_id": traceID, "traceID": traceID})
}
}
7 changes: 7 additions & 0 deletions pkg/frontend/querymiddleware/querysharding.go
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,13 @@ func mapEngineError(err error) error {
cause = err
}

// The engine sometimes returns context.Canceled without mapping it to one of the expected
// error types. Handle that specially here since we rely on the error type for errors being
// accurate.
if errors.Is(cause, context.Canceled) {
return apierror.New(apierror.TypeCanceled, cause.Error())
}

// By default, all errors returned by engine.Eval() are execution errors,
// This is the same as Prometheus API does: https://github.com/prometheus/prometheus/blob/076109fa1910ad2198bf2c447a174fee31114982/web/api/v1/api.go#L550-L550
errorType := apierror.TypeExec
Expand Down
61 changes: 61 additions & 0 deletions pkg/frontend/querymiddleware/querysharding_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2614,3 +2614,64 @@ func TestRemoveAnnotationPositionInformation(t *testing.T) {
})
}
}

func TestMapEngineError(t *testing.T) {
type testCase struct {
err error
expected error
}

testCases := map[string]testCase{
"already APIError": {
err: apierror.New(apierror.TypeNotFound, "not found"),
expected: apierror.New(apierror.TypeNotFound, "not found"),
},
"context canceled": {
err: context.Canceled,
expected: apierror.New(apierror.TypeCanceled, "context canceled"),
},
"context canceled wrapped": {
err: fmt.Errorf("%w: oh no", context.Canceled),
expected: apierror.New(apierror.TypeCanceled, "context canceled"),
},
"promql canceled": {
err: promql.ErrQueryCanceled("something"),
expected: apierror.New(apierror.TypeCanceled, "query was canceled in something"),
},
"promql canceled wrapped": {
err: fmt.Errorf("%w: oh no", promql.ErrQueryCanceled("something")),
expected: apierror.New(apierror.TypeCanceled, "query was canceled in something"),
},
"promql timeout": {
err: promql.ErrQueryTimeout("something"),
expected: apierror.New(apierror.TypeTimeout, "query timed out in something"),
},
"promql timeout wrapped": {
err: fmt.Errorf("%w: oh no", promql.ErrQueryTimeout("something")),
expected: apierror.New(apierror.TypeTimeout, "query timed out in something"),
},
"promql storage": {
err: promql.ErrStorage{Err: errors.New("storage")},
expected: apierror.New(apierror.TypeInternal, "storage"),
},
"promql storage wrapped": {
err: fmt.Errorf("%w: oh no", promql.ErrStorage{Err: errors.New("storage")}),
expected: apierror.New(apierror.TypeInternal, "storage"),
},
"promql too many samples": {
err: promql.ErrTooManySamples("something"),
expected: apierror.New(apierror.TypeExec, "query processing would load too many samples into memory in something"),
},
"promql too many samples wrapped": {
err: fmt.Errorf("%w: oh no", promql.ErrTooManySamples("something")),
expected: apierror.New(apierror.TypeExec, "query processing would load too many samples into memory in something"),
},
}

for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
res := mapEngineError(tc.err)
require.Equal(t, res, tc.expected)
})
}
}

0 comments on commit ce9463e

Please sign in to comment.