diff --git a/internal/internal.go b/internal/internal.go index d49d776ff..6a222069a 100644 --- a/internal/internal.go +++ b/internal/internal.go @@ -7,6 +7,7 @@ package internal import ( "context" "fmt" + "github.com/open-policy-agent/opa/topdown" "math" "net" "net/url" @@ -66,6 +67,8 @@ const ( // PluginName is the name to register with the OPA plugin manager PluginName = "envoy_ext_authz_grpc" + + checkRequestTimeoutErrorMessage = "check request timed out before query execution" ) // Validate receives a slice of bytes representing the plugin's @@ -181,7 +184,13 @@ func New(m *plugins.Manager, cfg *Config) plugins.Plugin { }, }, []string{"handler"}) plugin.metricAuthzDuration = *histogramAuthzDuration + errorCounter := prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "error_counter", + Help: "A counter for errors", + }, []string{"reason"}) + plugin.metricErrorCounter = *errorCounter plugin.manager.PrometheusRegister().MustRegister(histogramAuthzDuration) + plugin.manager.PrometheusRegister().MustRegister(errorCounter) } m.UpdatePluginStatus(PluginName, &plugins.Status{State: plugins.StateNotReady}) @@ -214,6 +223,7 @@ type envoyExtAuthzGrpcServer struct { interQueryBuiltinCache iCache.InterQueryCache distributedTracingOpts tracing.Options metricAuthzDuration prometheus.HistogramVec + metricErrorCounter prometheus.CounterVec } type envoyExtAuthzV2Wrapper struct { @@ -374,9 +384,20 @@ func (p *envoyExtAuthzGrpcServer) check(ctx context.Context, req interface{}) (* stop := func() *rpc_status.Status { stopeval() + if err != nil { + var topdownError *topdown.Error + if errors.As(err, &topdownError) { + p.metricErrorCounter.With(prometheus.Labels{"reason": topdownError.Code}).Inc() + } else if strings.Contains(err.Error(), checkRequestTimeoutErrorMessage) { + p.metricErrorCounter.With(prometheus.Labels{"reason": "check_request_timeout"}).Inc() + } else { + p.metricErrorCounter.With(prometheus.Labels{"reason": "eval_unknown_error"}).Inc() + } + } logErr := p.log(ctx, input, result, err) if logErr != nil { _ = txnClose(ctx, logErr) // Ignore error + p.metricErrorCounter.With(prometheus.Labels{"reason": "log_unknown_error"}).Inc() return &rpc_status.Status{ Code: int32(code.Code_UNKNOWN), Message: logErr.Error(), @@ -392,7 +413,7 @@ func (p *envoyExtAuthzGrpcServer) check(ctx context.Context, req interface{}) (* } if ctx.Err() != nil { - err = errors.Wrap(ctx.Err(), "check request timed out before query execution") + err = errors.Wrap(ctx.Err(), checkRequestTimeoutErrorMessage) return nil, stop, err } diff --git a/internal/internal_test.go b/internal/internal_test.go index f02e1e786..b29e1bb9e 100644 --- a/internal/internal_test.go +++ b/internal/internal_test.go @@ -491,7 +491,7 @@ func TestCheckContextTimeout(t *testing.T) { // create custom logger customLogger := &testPlugin{} - server := testAuthzServer(nil, withCustomLogger(customLogger)) + server := testAuthzServer(&Config{EnablePerformanceMetrics: true}, withCustomLogger(customLogger)) ctx, cancel := context.WithTimeout(context.Background(), time.Nanosecond*1) defer cancel() @@ -524,6 +524,29 @@ func TestCheckContextTimeout(t *testing.T) { if len((*event.Input).(map[string]interface{})) == 0 { t.Fatalf("Expected non empty input but got %v", *event.Input) } + + reg := prometheus.NewPedanticRegistry() + if err := reg.Register(server.metricErrorCounter); err != nil { + panic(fmt.Errorf("registering collector failed: %w", err)) + } + + g := prometheus.ToTransactionalGatherer(reg) + fam, _, err := g.Gather() + if err != nil { + panic(err) + } + if len(fam) != 1 { + t.Fatalf("Expected 1 metric, got %d", len(fam)) + } + if fam[0].Metric[0].Counter.GetValue() != 1 { + t.Fatalf("Expected counter value 1, got %v", fam[0].Metric[0].Counter.GetValue()) + } + if len(fam[0].Metric[0].GetLabel()) != 1 { + t.Fatalf("Expected 1 counter metric label, got %v", fam[0].Metric[0].GetLabel()) + } + if fam[0].Metric[0].GetLabel()[0].GetValue() != "check_request_timeout" { + t.Fatalf("Expected error metric with reason label check_request_timeout, got %v", fam[0].Metric[0].GetLabel()[0].GetValue()) + } } func TestCheckIllegalDecisionWithLogger(t *testing.T) { @@ -543,7 +566,7 @@ func TestCheckIllegalDecisionWithLogger(t *testing.T) { default allow = 1 ` - server := testAuthzServerWithModule(module, "envoy/authz/allow", nil, withCustomLogger(customLogger)) + server := testAuthzServerWithModule(module, "envoy/authz/allow", &Config{EnablePerformanceMetrics: true}, withCustomLogger(customLogger)) ctx := context.Background() output, err := server.Check(ctx, &req) if err == nil { @@ -569,6 +592,29 @@ func TestCheckIllegalDecisionWithLogger(t *testing.T) { event.DecisionID == "" || event.Metrics == nil { t.Fatalf("Unexpected events: %+v", customLogger.events) } + + reg := prometheus.NewPedanticRegistry() + if err := reg.Register(server.metricErrorCounter); err != nil { + panic(fmt.Errorf("registering collector failed: %w", err)) + } + + g := prometheus.ToTransactionalGatherer(reg) + fam, _, err := g.Gather() + if err != nil { + panic(err) + } + if len(fam) != 1 { + t.Fatalf("Expected 1 metric, got %d", len(fam)) + } + if fam[0].Metric[0].Counter.GetValue() != 1 { + t.Fatalf("Expected counter value 1, got %v", fam[0].Metric[0].Counter.GetValue()) + } + if len(fam[0].Metric[0].GetLabel()) != 1 { + t.Fatalf("Expected 1 counter metric label, got %v", fam[0].Metric[0].GetLabel()) + } + if fam[0].Metric[0].GetLabel()[0].GetValue() != "eval_unknown_error" { + t.Fatalf("Expected error metric with reason label eval_unknown_error, got %v", fam[0].Metric[0].GetLabel()[0].GetValue()) + } } func TestCheckDenyDecisionTruncatedBodyWithLogger(t *testing.T) { @@ -786,7 +832,7 @@ func TestCheckBadDecisionWithLogger(t *testing.T) { // create custom logger customLogger := &testPlugin{} - server := testAuthzServer(nil, withCustomLogger(customLogger)) + server := testAuthzServer(&Config{EnablePerformanceMetrics: true}, withCustomLogger(customLogger)) ctx := context.Background() output, err := server.Check(ctx, &req) @@ -808,6 +854,29 @@ func TestCheckBadDecisionWithLogger(t *testing.T) { event.DecisionID == "" || event.Metrics == nil { t.Fatalf("Unexpected events: %+v", customLogger.events) } + + reg := prometheus.NewPedanticRegistry() + if err := reg.Register(server.metricErrorCounter); err != nil { + panic(fmt.Errorf("registering collector failed: %w", err)) + } + + g := prometheus.ToTransactionalGatherer(reg) + fam, _, err := g.Gather() + if err != nil { + panic(err) + } + if len(fam) != 1 { + t.Fatalf("Expected 1 metric, got %d", len(fam)) + } + if fam[0].Metric[0].Counter.GetValue() != 1 { + t.Fatalf("Expected counter value 1, got %v", fam[0].Metric[0].Counter.GetValue()) + } + if len(fam[0].Metric[0].GetLabel()) != 1 { + t.Fatalf("Expected 1 counter metric label, got %v", fam[0].Metric[0].GetLabel()) + } + if fam[0].Metric[0].GetLabel()[0].GetValue() != "eval_unknown_error" { + t.Fatalf("Expected error metric with reason label eval_unknown_error, got %v", fam[0].Metric[0].GetLabel()[0].GetValue()) + } } func TestCheckEvalErrorWithLogger(t *testing.T) { @@ -826,7 +895,7 @@ func TestCheckEvalErrorWithLogger(t *testing.T) { allow:= true` - server := testAuthzServerWithModule(module, "envoy/authz/allow", nil, withCustomLogger(customLogger)) + server := testAuthzServerWithModule(module, "envoy/authz/allow", &Config{EnablePerformanceMetrics: true}, withCustomLogger(customLogger)) ctx := context.Background() output, err := server.Check(ctx, &req) @@ -853,6 +922,29 @@ func TestCheckEvalErrorWithLogger(t *testing.T) { if !strings.Contains(event.Error.Error(), expectedMsg) { t.Fatalf("Expected error message %v, but got %v", expectedMsg, event.Error.Error()) } + + reg := prometheus.NewPedanticRegistry() + if err := reg.Register(server.metricErrorCounter); err != nil { + panic(fmt.Errorf("registering collector failed: %w", err)) + } + + g := prometheus.ToTransactionalGatherer(reg) + fam, _, err := g.Gather() + if err != nil { + panic(err) + } + if len(fam) != 1 { + t.Fatalf("Expected 1 metric, got %d", len(fam)) + } + if fam[0].Metric[0].Counter.GetValue() != 1 { + t.Fatalf("Expected counter value 1, got %v", fam[0].Metric[0].Counter.GetValue()) + } + if len(fam[0].Metric[0].GetLabel()) != 1 { + t.Fatalf("Expected 1 counter metric label, got %v", fam[0].Metric[0].GetLabel()) + } + if fam[0].Metric[0].GetLabel()[0].GetValue() != topdown.ConflictErr { + t.Fatalf("Expected error metric with reason label %v, got %v", topdown.ConflictErr, fam[0].Metric[0].GetLabel()[0].GetValue()) + } } func TestCheckAllowObjectDecisionWithBadReqHeadersToRemoveWithLogger(t *testing.T) { @@ -881,7 +973,7 @@ func TestCheckAllowObjectDecisionWithBadReqHeadersToRemoveWithLogger(t *testing. customLogger := &testPlugin{} - server := testAuthzServerWithModule(module, "envoy/authz/result", nil, withCustomLogger(customLogger)) + server := testAuthzServerWithModule(module, "envoy/authz/result", &Config{EnablePerformanceMetrics: true}, withCustomLogger(customLogger)) ctx := context.Background() output, err := server.Check(ctx, &req) if err == nil { @@ -907,6 +999,29 @@ func TestCheckAllowObjectDecisionWithBadReqHeadersToRemoveWithLogger(t *testing. if !strings.Contains(event.Error.Error(), expectedMsg) { t.Fatalf("Expected error message %v, but got %v", expectedMsg, event.Error.Error()) } + + reg := prometheus.NewPedanticRegistry() + if err := reg.Register(server.metricErrorCounter); err != nil { + panic(fmt.Errorf("registering collector failed: %w", err)) + } + + g := prometheus.ToTransactionalGatherer(reg) + fam, _, err := g.Gather() + if err != nil { + panic(err) + } + if len(fam) != 1 { + t.Fatalf("Expected 1 metric, got %d", len(fam)) + } + if fam[0].Metric[0].Counter.GetValue() != 1 { + t.Fatalf("Expected counter value 1, got %v", fam[0].Metric[0].Counter.GetValue()) + } + if len(fam[0].Metric[0].GetLabel()) != 1 { + t.Fatalf("Expected 1 counter metric label, got %v", fam[0].Metric[0].GetLabel()) + } + if fam[0].Metric[0].GetLabel()[0].GetValue() != "eval_unknown_error" { + t.Fatalf("Expected error metric with reason label eval_unknown_error, got %v", fam[0].Metric[0].GetLabel()[0].GetValue()) + } } func TestCheckWithLoggerError(t *testing.T) { @@ -921,7 +1036,7 @@ func TestCheckWithLoggerError(t *testing.T) { // create custom logger customLogger := &testPluginError{} - server := testAuthzServer(nil, withCustomLogger(customLogger)) + server := testAuthzServer(&Config{EnablePerformanceMetrics: true}, withCustomLogger(customLogger)) ctx := context.Background() output, err := server.Check(ctx, &req) if err != nil { @@ -935,6 +1050,29 @@ func TestCheckWithLoggerError(t *testing.T) { if output.Status.Message != expectedMsg { t.Fatalf("Expected error message %v, but got %v", expectedMsg, output.Status.Message) } + + reg := prometheus.NewPedanticRegistry() + if err := reg.Register(server.metricErrorCounter); err != nil { + panic(fmt.Errorf("registering collector failed: %w", err)) + } + + g := prometheus.ToTransactionalGatherer(reg) + fam, _, err := g.Gather() + if err != nil { + panic(err) + } + if len(fam) != 1 { + t.Fatalf("Expected 1 metric, got %d", len(fam)) + } + if fam[0].Metric[0].Counter.GetValue() != 1 { + t.Fatalf("Expected counter value 1, got %v", fam[0].Metric[0].Counter.GetValue()) + } + if len(fam[0].Metric[0].GetLabel()) != 1 { + t.Fatalf("Expected 1 counter metric label, got %v", fam[0].Metric[0].GetLabel()) + } + if fam[0].Metric[0].GetLabel()[0].GetValue() != "log_unknown_error" { + t.Fatalf("Expected error metric with reason label log_unknown_error, got %v", fam[0].Metric[0].GetLabel()[0].GetValue()) + } } // Some decision log related tests are replicated for envoy.service.auth.v2.Authorization/Check @@ -945,7 +1083,7 @@ func TestCheckWithLoggerErrorV2(t *testing.T) { panic(err) } - server := envoyExtAuthzV2Wrapper{testAuthzServer(nil, withCustomLogger(&testPluginError{}))} + server := envoyExtAuthzV2Wrapper{testAuthzServer(&Config{EnablePerformanceMetrics: true}, withCustomLogger(&testPluginError{}))} ctx := context.Background() output, err := server.Check(ctx, &req) if err != nil { @@ -959,6 +1097,29 @@ func TestCheckWithLoggerErrorV2(t *testing.T) { if output.Status.Message != expectedMsg { t.Fatalf("Expected error message %v, but got %v", expectedMsg, output.Status.Message) } + + reg := prometheus.NewPedanticRegistry() + if err := reg.Register(server.v3.metricErrorCounter); err != nil { + panic(fmt.Errorf("registering collector failed: %w", err)) + } + + g := prometheus.ToTransactionalGatherer(reg) + fam, _, err := g.Gather() + if err != nil { + panic(err) + } + if len(fam) != 1 { + t.Fatalf("Expected 1 metric, got %d", len(fam)) + } + if fam[0].Metric[0].Counter.GetValue() != 1 { + t.Fatalf("Expected counter value 1, got %v", fam[0].Metric[0].Counter.GetValue()) + } + if len(fam[0].Metric[0].GetLabel()) != 1 { + t.Fatalf("Expected 1 counter metric label, got %v", fam[0].Metric[0].GetLabel()) + } + if fam[0].Metric[0].GetLabel()[0].GetValue() != "log_unknown_error" { + t.Fatalf("Expected error metric with reason label log_unknown_error, got %v", fam[0].Metric[0].GetLabel()[0].GetValue()) + } } func TestCheckBadDecisionWithLoggerV2(t *testing.T) { @@ -970,7 +1131,7 @@ func TestCheckBadDecisionWithLoggerV2(t *testing.T) { // create custom logger customLogger := &testPlugin{} - server := envoyExtAuthzV2Wrapper{testAuthzServer(nil, withCustomLogger(customLogger))} + server := envoyExtAuthzV2Wrapper{testAuthzServer(&Config{EnablePerformanceMetrics: true}, withCustomLogger(customLogger))} ctx := context.Background() output, err := server.Check(ctx, &req) @@ -992,6 +1153,29 @@ func TestCheckBadDecisionWithLoggerV2(t *testing.T) { event.DecisionID == "" || event.Metrics == nil { t.Fatalf("Unexpected events: %+v", customLogger.events) } + + reg := prometheus.NewPedanticRegistry() + if err := reg.Register(server.v3.metricErrorCounter); err != nil { + panic(fmt.Errorf("registering collector failed: %w", err)) + } + + g := prometheus.ToTransactionalGatherer(reg) + fam, _, err := g.Gather() + if err != nil { + panic(err) + } + if len(fam) != 1 { + t.Fatalf("Expected 1 metric, got %d", len(fam)) + } + if fam[0].Metric[0].Counter.GetValue() != 1 { + t.Fatalf("Expected counter value 1, got %v", fam[0].Metric[0].Counter.GetValue()) + } + if len(fam[0].Metric[0].GetLabel()) != 1 { + t.Fatalf("Expected 1 counter metric label, got %v", fam[0].Metric[0].GetLabel()) + } + if fam[0].Metric[0].GetLabel()[0].GetValue() != "eval_unknown_error" { + t.Fatalf("Expected error metric with reason label eval_unknown_error, got %v", fam[0].Metric[0].GetLabel()[0].GetValue()) + } } func TestCheckDenyWithLoggerV2(t *testing.T) {