From f07838f8845f7ad8d508557ddbbecf8c8cefde3b Mon Sep 17 00:00:00 2001 From: Rudrakh Panigrahi Date: Wed, 1 Nov 2023 16:36:27 +0530 Subject: [PATCH] added a counter metric for errors Signed-off-by: Rudrakh Panigrahi --- internal/errors.go | 66 +++++++++++++++++++++++++++++++++++ internal/internal.go | 73 +++++++++++++++++++++++++++++++-------- internal/internal_test.go | 59 ++++++++++++++++++++++++++----- 3 files changed, 175 insertions(+), 23 deletions(-) create mode 100644 internal/errors.go diff --git a/internal/errors.go b/internal/errors.go new file mode 100644 index 000000000..494a3f446 --- /dev/null +++ b/internal/errors.go @@ -0,0 +1,66 @@ +package internal + +import ( + "errors" + "fmt" +) + +// Error is the error type returned by the internal check function +type Error struct { + Code string `json:"code"` + err error `json:"-"` +} + +const ( + + // StartCheckErr error code returned when unable to start new execution + StartCheckErr string = "start_check_error" + + // StartTxnErr error code returned when unable to start new storage transaction + StartTxnErr string = "start_txn_error" + + // RequestParseErr error code returned when unable to parse protobuf request to input map + RequestParseErr string = "request_parse_error" + + // CheckRequestTimeoutErr error code returned when context deadline exceeds before eval + CheckRequestTimeoutErr string = "check_request_timeout" + + // InputParseErr error code returned when unable to convert input map to ast value + InputParseErr string = "input_parse_error" + + // EnvoyAuthEvalErr error code returned when auth eval fails + EnvoyAuthEvalErr string = "envoyauth_eval_error" + + // EnvoyAuthResultErr error code returned when error in fetching result from auth eval + EnvoyAuthResultErr string = "envoyauth_result_error" +) + +// Is allows matching internal errors using errors.Is +func (e *Error) Is(target error) bool { + var t *Error + if errors.As(target, &t) { + return (t.Code == "" || e.Code == t.Code) && errors.Is(e.Unwrap(), t.Unwrap()) + } + return false +} + +// Error allows converting internal Error to string type +func (e *Error) Error() string { + msg := fmt.Sprintf("%v: %v", e.Code, e.Unwrap().Error()) + return msg +} + +// Wrap wraps error as an internal error +func (e *Error) Wrap(err error) *Error { + e.err = err + return e +} + +// Unwrap gets error wrapped in the internal error +func (e *Error) Unwrap() error { + return e.err +} + +func internalError(code string, err error) Error { + return Error{Code: code, err: err} +} diff --git a/internal/internal.go b/internal/internal.go index d49d776ff..4aff1f123 100644 --- a/internal/internal.go +++ b/internal/internal.go @@ -7,6 +7,7 @@ package internal import ( "context" "fmt" + "github.com/open-policy-agent/opa/topdown" "math" "net" "net/url" @@ -181,7 +182,13 @@ func New(m *plugins.Manager, cfg *Config) plugins.Plugin { }, }, []string{"handler"}) plugin.metricAuthzDuration = *histogramAuthzDuration + errorCounter := prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "error_counter", + Help: "A counter for errors", + }, []string{"reason"}) + plugin.metricErrorCounter = *errorCounter plugin.manager.PrometheusRegister().MustRegister(histogramAuthzDuration) + plugin.manager.PrometheusRegister().MustRegister(errorCounter) } m.UpdatePluginStatus(PluginName, &plugins.Status{State: plugins.StateNotReady}) @@ -214,6 +221,7 @@ type envoyExtAuthzGrpcServer struct { interQueryBuiltinCache iCache.InterQueryCache distributedTracingOpts tracing.Options metricAuthzDuration prometheus.HistogramVec + metricErrorCounter prometheus.CounterVec } type envoyExtAuthzV2Wrapper struct { @@ -345,25 +353,32 @@ func (p *envoyExtAuthzGrpcServer) Check(ctx context.Context, req *ext_authz_v3.C if code := stop(); resp != nil && code != nil { resp.Status = code } - return resp, err + + if err != nil { + return resp, err.Unwrap() + } + return resp, nil } -func (p *envoyExtAuthzGrpcServer) check(ctx context.Context, req interface{}) (*ext_authz_v3.CheckResponse, func() *rpc_status.Status, error) { +func (p *envoyExtAuthzGrpcServer) check(ctx context.Context, req interface{}) (*ext_authz_v3.CheckResponse, func() *rpc_status.Status, *Error) { var err error var evalErr error + var internalErr Error start := time.Now() logger := p.manager.Logger() result, stopeval, err := envoyauth.NewEvalResult() if err != nil { logger.WithFields(map[string]interface{}{"err": err}).Error("Unable to start new evaluation.") - return nil, func() *rpc_status.Status { return nil }, err + internalErr = internalError(StartCheckErr, err) + return nil, func() *rpc_status.Status { return nil }, &internalErr } txn, txnClose, err := result.GetTxn(ctx, p.Store()) if err != nil { logger.WithFields(map[string]interface{}{"err": err}).Error("Unable to start new storage transaction.") - return nil, func() *rpc_status.Status { return nil }, err + internalErr = internalError(StartTxnErr, err) + return nil, func() *rpc_status.Status { return nil }, &internalErr } result.Txn = txn @@ -374,9 +389,21 @@ func (p *envoyExtAuthzGrpcServer) check(ctx context.Context, req interface{}) (* stop := func() *rpc_status.Status { stopeval() + if internalErr.Unwrap() != nil || internalErr.Code != "" { + var topdownError *topdown.Error + if errors.As(internalErr.Unwrap(), &topdownError) { + p.metricErrorCounter.With(prometheus.Labels{"reason": topdownError.Code}).Inc() + } else if internalErr.Code != "" { + p.metricErrorCounter.With(prometheus.Labels{"reason": internalErr.Code}).Inc() + } else { + p.metricErrorCounter.With(prometheus.Labels{"reason": "unknown_check_error"}).Inc() + } + } logErr := p.log(ctx, input, result, err) if logErr != nil { _ = txnClose(ctx, logErr) // Ignore error + p.Logger().Debug("Error when logging event: %v", logErr) + p.metricErrorCounter.With(prometheus.Labels{"reason": "unknown_log_error"}).Inc() return &rpc_status.Status{ Code: int32(code.Code_UNKNOWN), Message: logErr.Error(), @@ -388,23 +415,27 @@ func (p *envoyExtAuthzGrpcServer) check(ctx context.Context, req interface{}) (* input, err = envoyauth.RequestToInput(req, logger, p.cfg.protoSet, p.cfg.SkipRequestBodyParse) if err != nil { - return nil, stop, err + internalErr = internalError(RequestParseErr, err) + return nil, stop, &internalErr } if ctx.Err() != nil { err = errors.Wrap(ctx.Err(), "check request timed out before query execution") - return nil, stop, err + internalErr = internalError(CheckRequestTimeoutErr, err) + return nil, stop, &internalErr } var inputValue ast.Value inputValue, err = ast.InterfaceToValue(input) if err != nil { - return nil, stop, err + internalErr = internalError(InputParseErr, err) + return nil, stop, &internalErr } if err = envoyauth.Eval(ctx, p, inputValue, result); err != nil { evalErr = err - return nil, stop, err + internalErr = internalError(EnvoyAuthEvalErr, err) + return nil, stop, &internalErr } resp := &ext_authz_v3.CheckResponse{} @@ -412,7 +443,9 @@ func (p *envoyExtAuthzGrpcServer) check(ctx context.Context, req interface{}) (* var allowed bool allowed, err = result.IsAllowed() if err != nil { - return nil, stop, errors.Wrap(err, "failed to get response status") + err = errors.Wrap(err, "failed to get response status") + internalErr = internalError(EnvoyAuthResultErr, err) + return nil, stop, &internalErr } status := int32(code.Code_PERMISSION_DENIED) @@ -426,7 +459,9 @@ func (p *envoyExtAuthzGrpcServer) check(ctx context.Context, req interface{}) (* var responseHeaders []*ext_core_v3.HeaderValueOption responseHeaders, err = result.GetResponseEnvoyHeaderValueOptions() if err != nil { - return nil, stop, errors.Wrap(err, "failed to get response headers") + err = errors.Wrap(err, "failed to get response headers") + internalErr = internalError(EnvoyAuthResultErr, err) + return nil, stop, &internalErr } if status == int32(code.Code_OK) { @@ -434,13 +469,17 @@ func (p *envoyExtAuthzGrpcServer) check(ctx context.Context, req interface{}) (* var headersToRemove []string headersToRemove, err = result.GetRequestHTTPHeadersToRemove() if err != nil { - return nil, stop, errors.Wrap(err, "failed to get request headers to remove") + err = errors.Wrap(err, "failed to get request headers to remove") + internalErr = internalError(EnvoyAuthResultErr, err) + return nil, stop, &internalErr } var responseHeadersToAdd []*ext_core_v3.HeaderValueOption responseHeadersToAdd, err = result.GetResponseHTTPHeadersToAdd() if err != nil { - return nil, stop, errors.Wrap(err, "failed to get response headers to send to client") + err = errors.Wrap(err, "failed to get response headers to send to client") + internalErr = internalError(EnvoyAuthResultErr, err) + return nil, stop, &internalErr } resp.HttpResponse = &ext_authz_v3.CheckResponse_OkResponse{ @@ -454,13 +493,17 @@ func (p *envoyExtAuthzGrpcServer) check(ctx context.Context, req interface{}) (* var body string body, err = result.GetResponseBody() if err != nil { - return nil, stop, errors.Wrap(err, "failed to get response body") + err = errors.Wrap(err, "failed to get response body") + internalErr = internalError(EnvoyAuthResultErr, err) + return nil, stop, &internalErr } var httpStatus *ext_type_v3.HttpStatus httpStatus, err = result.GetResponseEnvoyHTTPStatus() if err != nil { - return nil, stop, errors.Wrap(err, "failed to get response http status") + err = errors.Wrap(err, "failed to get response http status") + internalErr = internalError(EnvoyAuthResultErr, err) + return nil, stop, &internalErr } deniedResponse := &ext_authz_v3.DeniedHttpResponse{ @@ -577,7 +620,7 @@ func (p *envoyExtAuthzV2Wrapper) Check(ctx context.Context, req *ext_authz_v2.Ch }() if err != nil { - return nil, err + return nil, err.Unwrap() } respV2 = v2Response(respV3) return respV2, nil diff --git a/internal/internal_test.go b/internal/internal_test.go index f02e1e786..ede35def0 100644 --- a/internal/internal_test.go +++ b/internal/internal_test.go @@ -491,7 +491,7 @@ func TestCheckContextTimeout(t *testing.T) { // create custom logger customLogger := &testPlugin{} - server := testAuthzServer(nil, withCustomLogger(customLogger)) + server := testAuthzServer(&Config{EnablePerformanceMetrics: true}, withCustomLogger(customLogger)) ctx, cancel := context.WithTimeout(context.Background(), time.Nanosecond*1) defer cancel() @@ -524,6 +524,8 @@ func TestCheckContextTimeout(t *testing.T) { if len((*event.Input).(map[string]interface{})) == 0 { t.Fatalf("Expected non empty input but got %v", *event.Input) } + + assertErrorCounterMetric(t, server, "check_request_timeout") } func TestCheckIllegalDecisionWithLogger(t *testing.T) { @@ -543,7 +545,7 @@ func TestCheckIllegalDecisionWithLogger(t *testing.T) { default allow = 1 ` - server := testAuthzServerWithModule(module, "envoy/authz/allow", nil, withCustomLogger(customLogger)) + server := testAuthzServerWithModule(module, "envoy/authz/allow", &Config{EnablePerformanceMetrics: true}, withCustomLogger(customLogger)) ctx := context.Background() output, err := server.Check(ctx, &req) if err == nil { @@ -569,6 +571,8 @@ func TestCheckIllegalDecisionWithLogger(t *testing.T) { event.DecisionID == "" || event.Metrics == nil { t.Fatalf("Unexpected events: %+v", customLogger.events) } + + assertErrorCounterMetric(t, server, "envoyauth_result_error") } func TestCheckDenyDecisionTruncatedBodyWithLogger(t *testing.T) { @@ -786,7 +790,7 @@ func TestCheckBadDecisionWithLogger(t *testing.T) { // create custom logger customLogger := &testPlugin{} - server := testAuthzServer(nil, withCustomLogger(customLogger)) + server := testAuthzServer(&Config{EnablePerformanceMetrics: true}, withCustomLogger(customLogger)) ctx := context.Background() output, err := server.Check(ctx, &req) @@ -808,6 +812,8 @@ func TestCheckBadDecisionWithLogger(t *testing.T) { event.DecisionID == "" || event.Metrics == nil { t.Fatalf("Unexpected events: %+v", customLogger.events) } + + assertErrorCounterMetric(t, server, "request_parse_error") } func TestCheckEvalErrorWithLogger(t *testing.T) { @@ -826,7 +832,7 @@ func TestCheckEvalErrorWithLogger(t *testing.T) { allow:= true` - server := testAuthzServerWithModule(module, "envoy/authz/allow", nil, withCustomLogger(customLogger)) + server := testAuthzServerWithModule(module, "envoy/authz/allow", &Config{EnablePerformanceMetrics: true}, withCustomLogger(customLogger)) ctx := context.Background() output, err := server.Check(ctx, &req) @@ -853,6 +859,8 @@ func TestCheckEvalErrorWithLogger(t *testing.T) { if !strings.Contains(event.Error.Error(), expectedMsg) { t.Fatalf("Expected error message %v, but got %v", expectedMsg, event.Error.Error()) } + + assertErrorCounterMetric(t, server, topdown.ConflictErr) } func TestCheckAllowObjectDecisionWithBadReqHeadersToRemoveWithLogger(t *testing.T) { @@ -881,7 +889,7 @@ func TestCheckAllowObjectDecisionWithBadReqHeadersToRemoveWithLogger(t *testing. customLogger := &testPlugin{} - server := testAuthzServerWithModule(module, "envoy/authz/result", nil, withCustomLogger(customLogger)) + server := testAuthzServerWithModule(module, "envoy/authz/result", &Config{EnablePerformanceMetrics: true}, withCustomLogger(customLogger)) ctx := context.Background() output, err := server.Check(ctx, &req) if err == nil { @@ -907,6 +915,8 @@ func TestCheckAllowObjectDecisionWithBadReqHeadersToRemoveWithLogger(t *testing. if !strings.Contains(event.Error.Error(), expectedMsg) { t.Fatalf("Expected error message %v, but got %v", expectedMsg, event.Error.Error()) } + + assertErrorCounterMetric(t, server, "envoyauth_result_error") } func TestCheckWithLoggerError(t *testing.T) { @@ -921,7 +931,7 @@ func TestCheckWithLoggerError(t *testing.T) { // create custom logger customLogger := &testPluginError{} - server := testAuthzServer(nil, withCustomLogger(customLogger)) + server := testAuthzServer(&Config{EnablePerformanceMetrics: true}, withCustomLogger(customLogger)) ctx := context.Background() output, err := server.Check(ctx, &req) if err != nil { @@ -935,6 +945,8 @@ func TestCheckWithLoggerError(t *testing.T) { if output.Status.Message != expectedMsg { t.Fatalf("Expected error message %v, but got %v", expectedMsg, output.Status.Message) } + + assertErrorCounterMetric(t, server, "unknown_log_error") } // Some decision log related tests are replicated for envoy.service.auth.v2.Authorization/Check @@ -945,7 +957,7 @@ func TestCheckWithLoggerErrorV2(t *testing.T) { panic(err) } - server := envoyExtAuthzV2Wrapper{testAuthzServer(nil, withCustomLogger(&testPluginError{}))} + server := envoyExtAuthzV2Wrapper{testAuthzServer(&Config{EnablePerformanceMetrics: true}, withCustomLogger(&testPluginError{}))} ctx := context.Background() output, err := server.Check(ctx, &req) if err != nil { @@ -959,6 +971,8 @@ func TestCheckWithLoggerErrorV2(t *testing.T) { if output.Status.Message != expectedMsg { t.Fatalf("Expected error message %v, but got %v", expectedMsg, output.Status.Message) } + + assertErrorCounterMetric(t, server.v3, "unknown_log_error") } func TestCheckBadDecisionWithLoggerV2(t *testing.T) { @@ -970,7 +984,7 @@ func TestCheckBadDecisionWithLoggerV2(t *testing.T) { // create custom logger customLogger := &testPlugin{} - server := envoyExtAuthzV2Wrapper{testAuthzServer(nil, withCustomLogger(customLogger))} + server := envoyExtAuthzV2Wrapper{testAuthzServer(&Config{EnablePerformanceMetrics: true}, withCustomLogger(customLogger))} ctx := context.Background() output, err := server.Check(ctx, &req) @@ -992,6 +1006,8 @@ func TestCheckBadDecisionWithLoggerV2(t *testing.T) { event.DecisionID == "" || event.Metrics == nil { t.Fatalf("Unexpected events: %+v", customLogger.events) } + + assertErrorCounterMetric(t, server.v3, "request_parse_error") } func TestCheckDenyWithLoggerV2(t *testing.T) { @@ -1973,6 +1989,33 @@ func assertHeaderValues(t *testing.T, expectedHeaders http.Header, headersToAdd } } +func assertErrorCounterMetric(t *testing.T, server *envoyExtAuthzGrpcServer, labelValues ...string) { + reg := prometheus.NewPedanticRegistry() + if err := reg.Register(server.metricErrorCounter); err != nil { + t.Fatalf("registering collector failed: %v", err) + } + + g := prometheus.ToTransactionalGatherer(reg) + fam, _, err := g.Gather() + if err != nil { + t.Fatalf("gathering metrics failed: %v", err) + } + if len(fam) != 1 { + t.Fatalf("Expected 1 metric, got %d", len(fam)) + } + if fam[0].Metric[0].Counter.GetValue() != 1 { + t.Fatalf("Expected counter value 1, got %v", fam[0].Metric[0].Counter.GetValue()) + } + if len(fam[0].Metric[0].GetLabel()) != len(labelValues) { + t.Fatalf("Expected 1 counter metric label, got %v", fam[0].Metric[0].GetLabel()) + } + for labelIndex, labelValue := range labelValues { + if fam[0].Metric[0].GetLabel()[labelIndex].GetValue() != labelValue { + t.Fatalf("Expected error metric with reason label %v, got %v", labelValue, fam[0].Metric[0].GetLabel()[labelIndex].GetValue()) + } + } +} + type testPlugin struct { events []logs.EventV1 }