Skip to content

Commit d58da4a

Browse files
authored
Ruler: Report wider range of errors from remote rule evaluation queries. (#2143)
The `cortex_ruler_queries_failed_total` metric should increment whenever a rule evaluation query fails. Before this change, errors would be ignored unless they are specifically of type`httpgrpc`. This means that many errors get lost, for example, connectivity errors coming from gRPC.
1 parent 5e87331 commit d58da4a

File tree

3 files changed

+27
-12
lines changed

3 files changed

+27
-12
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
* [BUGFIX] Ring: fix bug where instances may appear unhealthy in the hash ring web UI even though they are not. #1933
3939
* [BUGFIX] API: gzip is now enforced when identity encoding is explicitly rejected. #1864
4040
* [BUGFIX] Fix panic at startup when Mimir is running in monolithic mode and query sharding is enabled. #2036
41-
* [BUGFIX] Ruler: report failed evaluation metric for any 5xx status code returned by the query-frontend when remote operational mode is enabled. #2053
41+
* [BUGFIX] Ruler: report `cortex_ruler_queries_failed_total` metric for any remote query error except 4xx when remote operational mode is enabled. #2053 #2143
4242
* [BUGFIX] Ingester: fix slow rollout when using `-ingester.ring.unregister-on-shutdown=false` with long `-ingester.ring.heartbeat-period`. #2085
4343
* [BUGFIX] Ruler: add timeout for remote rule evaluation queries to prevent rule group evaluations getting stuck indefinitely. The duration is configurable with (`-ruler.query-frontend.timeout` (default `2m`). #2090
4444
* [BUGFIX] Limits: Active series custom tracker configuration has been named back from `active_series_custom_trackers_config` to `active_series_custom_trackers`. For backwards compatibility both version is going to be supported for until Mimir v2.4. When both fields are specified, `active_series_custom_trackers_config` takes precedence over `active_series_custom_trackers`. #2101

pkg/ruler/compat.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -150,9 +150,9 @@ func MetricsQueryFunc(qf rules.QueryFunc, queries, failedQueries prometheus.Coun
150150
return result, origErr
151151

152152
} else if err != nil {
153-
// When remote querier enabled, only consider failed queries those returning a 5xx status code.
153+
// When remote querier enabled, consider anything an error except those with 4xx status code.
154154
st, ok := status.FromError(err)
155-
if ok && st.Code()/100 == 5 {
155+
if !(ok && st.Code()/100 == 4) {
156156
failedQueries.Inc()
157157
}
158158
}

pkg/ruler/compat_test.go

Lines changed: 24 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -153,6 +153,7 @@ func TestPusherErrors(t *testing.T) {
153153
func TestMetricsQueryFuncErrors(t *testing.T) {
154154
for name, tc := range map[string]struct {
155155
returnedError error
156+
expectedError error
156157
expectedQueries int
157158
expectedFailedQueries int
158159
}{
@@ -161,44 +162,58 @@ func TestMetricsQueryFuncErrors(t *testing.T) {
161162
expectedFailedQueries: 0,
162163
},
163164

164-
"400 error": {
165+
"httpgrpc 400 error": {
165166
returnedError: httpgrpc.Errorf(http.StatusBadRequest, "test error"),
167+
expectedError: httpgrpc.Errorf(http.StatusBadRequest, "test error"),
166168
expectedQueries: 1,
167169
expectedFailedQueries: 0, // 400 errors not reported as failures.
168170
},
169171

170-
"500 error": {
172+
"httpgrpc 500 error": {
171173
returnedError: httpgrpc.Errorf(http.StatusInternalServerError, "test error"),
174+
expectedError: httpgrpc.Errorf(http.StatusInternalServerError, "test error"),
172175
expectedQueries: 1,
173176
expectedFailedQueries: 1, // 500 errors are failures
174177
},
175178

179+
"unknown but non-queryable error": {
180+
returnedError: errors.New("test error"),
181+
expectedError: errors.New("test error"),
182+
expectedQueries: 1,
183+
expectedFailedQueries: 1, // Any other error should always be reported.
184+
},
185+
176186
"promql.ErrStorage": {
177-
returnedError: promql.ErrStorage{Err: errors.New("test error")},
187+
returnedError: WrapQueryableErrors(promql.ErrStorage{Err: errors.New("test error")}),
188+
expectedError: promql.ErrStorage{Err: errors.New("test error")},
178189
expectedQueries: 1,
179190
expectedFailedQueries: 1,
180191
},
181192

182193
"promql.ErrQueryCanceled": {
183-
returnedError: promql.ErrQueryCanceled("test error"),
194+
returnedError: WrapQueryableErrors(promql.ErrQueryCanceled("test error")),
195+
expectedError: promql.ErrQueryCanceled("test error"),
184196
expectedQueries: 1,
185197
expectedFailedQueries: 0, // Not interesting.
186198
},
187199

188200
"promql.ErrQueryTimeout": {
189-
returnedError: promql.ErrQueryTimeout("test error"),
201+
returnedError: WrapQueryableErrors(promql.ErrQueryTimeout("test error")),
202+
expectedError: promql.ErrQueryTimeout("test error"),
190203
expectedQueries: 1,
191204
expectedFailedQueries: 0, // Not interesting.
192205
},
193206

194207
"promql.ErrTooManySamples": {
195-
returnedError: promql.ErrTooManySamples("test error"),
208+
returnedError: WrapQueryableErrors(promql.ErrTooManySamples("test error")),
209+
expectedError: promql.ErrTooManySamples("test error"),
196210
expectedQueries: 1,
197211
expectedFailedQueries: 0, // Not interesting.
198212
},
199213

200214
"unknown error": {
201-
returnedError: errors.New("test error"),
215+
returnedError: WrapQueryableErrors(errors.New("test error")),
216+
expectedError: errors.New("test error"),
202217
expectedQueries: 1,
203218
expectedFailedQueries: 1, // unknown errors are not 400, so they are reported.
204219
},
@@ -208,12 +223,12 @@ func TestMetricsQueryFuncErrors(t *testing.T) {
208223
failures := prometheus.NewCounter(prometheus.CounterOpts{})
209224

210225
mockFunc := func(ctx context.Context, q string, t time.Time) (promql.Vector, error) {
211-
return promql.Vector{}, WrapQueryableErrors(tc.returnedError)
226+
return promql.Vector{}, tc.returnedError
212227
}
213228
qf := MetricsQueryFunc(mockFunc, queries, failures)
214229

215230
_, err := qf(context.Background(), "test", time.Now())
216-
require.Equal(t, tc.returnedError, err)
231+
require.Equal(t, tc.expectedError, err)
217232

218233
require.Equal(t, tc.expectedQueries, int(testutil.ToFloat64(queries)))
219234
require.Equal(t, tc.expectedFailedQueries, int(testutil.ToFloat64(failures)))

0 commit comments

Comments
 (0)