Skip to content

Commit

Permalink
added a counter metric for errors
Browse files Browse the repository at this point in the history
Signed-off-by: Rudrakh Panigrahi <[email protected]>
  • Loading branch information
rudrakhp committed Nov 1, 2023
1 parent 119fc3f commit 3dc3bac
Show file tree
Hide file tree
Showing 2 changed files with 214 additions and 9 deletions.
23 changes: 22 additions & 1 deletion internal/internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ package internal
import (
"context"
"fmt"
"github.com/open-policy-agent/opa/topdown"
"math"
"net"
"net/url"
Expand Down Expand Up @@ -66,6 +67,8 @@ const (

// PluginName is the name to register with the OPA plugin manager
PluginName = "envoy_ext_authz_grpc"

checkRequestTimeoutErrorMessage = "check request timed out before query execution"
)

// Validate receives a slice of bytes representing the plugin's
Expand Down Expand Up @@ -181,7 +184,13 @@ func New(m *plugins.Manager, cfg *Config) plugins.Plugin {
},
}, []string{"handler"})
plugin.metricAuthzDuration = *histogramAuthzDuration
errorCounter := prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "error_counter",
Help: "A counter for errors",
}, []string{"reason"})
plugin.metricErrorCounter = *errorCounter
plugin.manager.PrometheusRegister().MustRegister(histogramAuthzDuration)
plugin.manager.PrometheusRegister().MustRegister(errorCounter)
}

m.UpdatePluginStatus(PluginName, &plugins.Status{State: plugins.StateNotReady})
Expand Down Expand Up @@ -214,6 +223,7 @@ type envoyExtAuthzGrpcServer struct {
interQueryBuiltinCache iCache.InterQueryCache
distributedTracingOpts tracing.Options
metricAuthzDuration prometheus.HistogramVec
metricErrorCounter prometheus.CounterVec
}

type envoyExtAuthzV2Wrapper struct {
Expand Down Expand Up @@ -374,9 +384,20 @@ func (p *envoyExtAuthzGrpcServer) check(ctx context.Context, req interface{}) (*

stop := func() *rpc_status.Status {
stopeval()
if err != nil {
var topdownError *topdown.Error
if errors.As(err, &topdownError) {
p.metricErrorCounter.With(prometheus.Labels{"reason": topdownError.Code}).Inc()
} else if strings.Contains(err.Error(), checkRequestTimeoutErrorMessage) {
p.metricErrorCounter.With(prometheus.Labels{"reason": "check_request_timeout"}).Inc()
} else {
p.metricErrorCounter.With(prometheus.Labels{"reason": "eval_unknown_error"}).Inc()
}
}
logErr := p.log(ctx, input, result, err)
if logErr != nil {
_ = txnClose(ctx, logErr) // Ignore error
p.metricErrorCounter.With(prometheus.Labels{"reason": "log_unknown_error"}).Inc()
return &rpc_status.Status{
Code: int32(code.Code_UNKNOWN),
Message: logErr.Error(),
Expand All @@ -392,7 +413,7 @@ func (p *envoyExtAuthzGrpcServer) check(ctx context.Context, req interface{}) (*
}

if ctx.Err() != nil {
err = errors.Wrap(ctx.Err(), "check request timed out before query execution")
err = errors.Wrap(ctx.Err(), checkRequestTimeoutErrorMessage)
return nil, stop, err
}

Expand Down
200 changes: 192 additions & 8 deletions internal/internal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -491,7 +491,7 @@ func TestCheckContextTimeout(t *testing.T) {
// create custom logger
customLogger := &testPlugin{}

server := testAuthzServer(nil, withCustomLogger(customLogger))
server := testAuthzServer(&Config{EnablePerformanceMetrics: true}, withCustomLogger(customLogger))

ctx, cancel := context.WithTimeout(context.Background(), time.Nanosecond*1)
defer cancel()
Expand Down Expand Up @@ -524,6 +524,29 @@ func TestCheckContextTimeout(t *testing.T) {
if len((*event.Input).(map[string]interface{})) == 0 {
t.Fatalf("Expected non empty input but got %v", *event.Input)
}

reg := prometheus.NewPedanticRegistry()
if err := reg.Register(server.metricErrorCounter); err != nil {
panic(fmt.Errorf("registering collector failed: %w", err))
}

g := prometheus.ToTransactionalGatherer(reg)
fam, _, err := g.Gather()
if err != nil {
panic(err)
}
if len(fam) != 1 {
t.Fatalf("Expected 1 metric, got %d", len(fam))
}
if fam[0].Metric[0].Counter.GetValue() != 1 {
t.Fatalf("Expected counter value 1, got %v", fam[0].Metric[0].Counter.GetValue())
}
if len(fam[0].Metric[0].GetLabel()) != 1 {
t.Fatalf("Expected 1 counter metric label, got %v", fam[0].Metric[0].GetLabel())
}
if fam[0].Metric[0].GetLabel()[0].GetValue() != "check_request_timeout" {
t.Fatalf("Expected error metric with reason label check_request_timeout, got %v", fam[0].Metric[0].GetLabel()[0].GetValue())
}
}

func TestCheckIllegalDecisionWithLogger(t *testing.T) {
Expand All @@ -543,7 +566,7 @@ func TestCheckIllegalDecisionWithLogger(t *testing.T) {
default allow = 1
`
server := testAuthzServerWithModule(module, "envoy/authz/allow", nil, withCustomLogger(customLogger))
server := testAuthzServerWithModule(module, "envoy/authz/allow", &Config{EnablePerformanceMetrics: true}, withCustomLogger(customLogger))
ctx := context.Background()
output, err := server.Check(ctx, &req)
if err == nil {
Expand All @@ -569,6 +592,29 @@ func TestCheckIllegalDecisionWithLogger(t *testing.T) {
event.DecisionID == "" || event.Metrics == nil {
t.Fatalf("Unexpected events: %+v", customLogger.events)
}

reg := prometheus.NewPedanticRegistry()
if err := reg.Register(server.metricErrorCounter); err != nil {
panic(fmt.Errorf("registering collector failed: %w", err))
}

g := prometheus.ToTransactionalGatherer(reg)
fam, _, err := g.Gather()
if err != nil {
panic(err)
}
if len(fam) != 1 {
t.Fatalf("Expected 1 metric, got %d", len(fam))
}
if fam[0].Metric[0].Counter.GetValue() != 1 {
t.Fatalf("Expected counter value 1, got %v", fam[0].Metric[0].Counter.GetValue())
}
if len(fam[0].Metric[0].GetLabel()) != 1 {
t.Fatalf("Expected 1 counter metric label, got %v", fam[0].Metric[0].GetLabel())
}
if fam[0].Metric[0].GetLabel()[0].GetValue() != "eval_unknown_error" {
t.Fatalf("Expected error metric with reason label eval_unknown_error, got %v", fam[0].Metric[0].GetLabel()[0].GetValue())
}
}

func TestCheckDenyDecisionTruncatedBodyWithLogger(t *testing.T) {
Expand Down Expand Up @@ -786,7 +832,7 @@ func TestCheckBadDecisionWithLogger(t *testing.T) {
// create custom logger
customLogger := &testPlugin{}

server := testAuthzServer(nil, withCustomLogger(customLogger))
server := testAuthzServer(&Config{EnablePerformanceMetrics: true}, withCustomLogger(customLogger))
ctx := context.Background()
output, err := server.Check(ctx, &req)

Expand All @@ -808,6 +854,29 @@ func TestCheckBadDecisionWithLogger(t *testing.T) {
event.DecisionID == "" || event.Metrics == nil {
t.Fatalf("Unexpected events: %+v", customLogger.events)
}

reg := prometheus.NewPedanticRegistry()
if err := reg.Register(server.metricErrorCounter); err != nil {
panic(fmt.Errorf("registering collector failed: %w", err))
}

g := prometheus.ToTransactionalGatherer(reg)
fam, _, err := g.Gather()
if err != nil {
panic(err)
}
if len(fam) != 1 {
t.Fatalf("Expected 1 metric, got %d", len(fam))
}
if fam[0].Metric[0].Counter.GetValue() != 1 {
t.Fatalf("Expected counter value 1, got %v", fam[0].Metric[0].Counter.GetValue())
}
if len(fam[0].Metric[0].GetLabel()) != 1 {
t.Fatalf("Expected 1 counter metric label, got %v", fam[0].Metric[0].GetLabel())
}
if fam[0].Metric[0].GetLabel()[0].GetValue() != "eval_unknown_error" {
t.Fatalf("Expected error metric with reason label eval_unknown_error, got %v", fam[0].Metric[0].GetLabel()[0].GetValue())
}
}

func TestCheckEvalErrorWithLogger(t *testing.T) {
Expand All @@ -826,7 +895,7 @@ func TestCheckEvalErrorWithLogger(t *testing.T) {
allow:= true`

server := testAuthzServerWithModule(module, "envoy/authz/allow", nil, withCustomLogger(customLogger))
server := testAuthzServerWithModule(module, "envoy/authz/allow", &Config{EnablePerformanceMetrics: true}, withCustomLogger(customLogger))
ctx := context.Background()
output, err := server.Check(ctx, &req)

Expand All @@ -853,6 +922,29 @@ func TestCheckEvalErrorWithLogger(t *testing.T) {
if !strings.Contains(event.Error.Error(), expectedMsg) {
t.Fatalf("Expected error message %v, but got %v", expectedMsg, event.Error.Error())
}

reg := prometheus.NewPedanticRegistry()
if err := reg.Register(server.metricErrorCounter); err != nil {
panic(fmt.Errorf("registering collector failed: %w", err))
}

g := prometheus.ToTransactionalGatherer(reg)
fam, _, err := g.Gather()
if err != nil {
panic(err)
}
if len(fam) != 1 {
t.Fatalf("Expected 1 metric, got %d", len(fam))
}
if fam[0].Metric[0].Counter.GetValue() != 1 {
t.Fatalf("Expected counter value 1, got %v", fam[0].Metric[0].Counter.GetValue())
}
if len(fam[0].Metric[0].GetLabel()) != 1 {
t.Fatalf("Expected 1 counter metric label, got %v", fam[0].Metric[0].GetLabel())
}
if fam[0].Metric[0].GetLabel()[0].GetValue() != topdown.ConflictErr {
t.Fatalf("Expected error metric with reason label %v, got %v", topdown.ConflictErr, fam[0].Metric[0].GetLabel()[0].GetValue())
}
}

func TestCheckAllowObjectDecisionWithBadReqHeadersToRemoveWithLogger(t *testing.T) {
Expand Down Expand Up @@ -881,7 +973,7 @@ func TestCheckAllowObjectDecisionWithBadReqHeadersToRemoveWithLogger(t *testing.

customLogger := &testPlugin{}

server := testAuthzServerWithModule(module, "envoy/authz/result", nil, withCustomLogger(customLogger))
server := testAuthzServerWithModule(module, "envoy/authz/result", &Config{EnablePerformanceMetrics: true}, withCustomLogger(customLogger))
ctx := context.Background()
output, err := server.Check(ctx, &req)
if err == nil {
Expand All @@ -907,6 +999,29 @@ func TestCheckAllowObjectDecisionWithBadReqHeadersToRemoveWithLogger(t *testing.
if !strings.Contains(event.Error.Error(), expectedMsg) {
t.Fatalf("Expected error message %v, but got %v", expectedMsg, event.Error.Error())
}

reg := prometheus.NewPedanticRegistry()
if err := reg.Register(server.metricErrorCounter); err != nil {
panic(fmt.Errorf("registering collector failed: %w", err))
}

g := prometheus.ToTransactionalGatherer(reg)
fam, _, err := g.Gather()
if err != nil {
panic(err)
}
if len(fam) != 1 {
t.Fatalf("Expected 1 metric, got %d", len(fam))
}
if fam[0].Metric[0].Counter.GetValue() != 1 {
t.Fatalf("Expected counter value 1, got %v", fam[0].Metric[0].Counter.GetValue())
}
if len(fam[0].Metric[0].GetLabel()) != 1 {
t.Fatalf("Expected 1 counter metric label, got %v", fam[0].Metric[0].GetLabel())
}
if fam[0].Metric[0].GetLabel()[0].GetValue() != "eval_unknown_error" {
t.Fatalf("Expected error metric with reason label eval_unknown_error, got %v", fam[0].Metric[0].GetLabel()[0].GetValue())
}
}

func TestCheckWithLoggerError(t *testing.T) {
Expand All @@ -921,7 +1036,7 @@ func TestCheckWithLoggerError(t *testing.T) {
// create custom logger
customLogger := &testPluginError{}

server := testAuthzServer(nil, withCustomLogger(customLogger))
server := testAuthzServer(&Config{EnablePerformanceMetrics: true}, withCustomLogger(customLogger))
ctx := context.Background()
output, err := server.Check(ctx, &req)
if err != nil {
Expand All @@ -935,6 +1050,29 @@ func TestCheckWithLoggerError(t *testing.T) {
if output.Status.Message != expectedMsg {
t.Fatalf("Expected error message %v, but got %v", expectedMsg, output.Status.Message)
}

reg := prometheus.NewPedanticRegistry()
if err := reg.Register(server.metricErrorCounter); err != nil {
panic(fmt.Errorf("registering collector failed: %w", err))
}

g := prometheus.ToTransactionalGatherer(reg)
fam, _, err := g.Gather()
if err != nil {
panic(err)
}
if len(fam) != 1 {
t.Fatalf("Expected 1 metric, got %d", len(fam))
}
if fam[0].Metric[0].Counter.GetValue() != 1 {
t.Fatalf("Expected counter value 1, got %v", fam[0].Metric[0].Counter.GetValue())
}
if len(fam[0].Metric[0].GetLabel()) != 1 {
t.Fatalf("Expected 1 counter metric label, got %v", fam[0].Metric[0].GetLabel())
}
if fam[0].Metric[0].GetLabel()[0].GetValue() != "log_unknown_error" {
t.Fatalf("Expected error metric with reason label log_unknown_error, got %v", fam[0].Metric[0].GetLabel()[0].GetValue())
}
}

// Some decision log related tests are replicated for envoy.service.auth.v2.Authorization/Check
Expand All @@ -945,7 +1083,7 @@ func TestCheckWithLoggerErrorV2(t *testing.T) {
panic(err)
}

server := envoyExtAuthzV2Wrapper{testAuthzServer(nil, withCustomLogger(&testPluginError{}))}
server := envoyExtAuthzV2Wrapper{testAuthzServer(&Config{EnablePerformanceMetrics: true}, withCustomLogger(&testPluginError{}))}
ctx := context.Background()
output, err := server.Check(ctx, &req)
if err != nil {
Expand All @@ -959,6 +1097,29 @@ func TestCheckWithLoggerErrorV2(t *testing.T) {
if output.Status.Message != expectedMsg {
t.Fatalf("Expected error message %v, but got %v", expectedMsg, output.Status.Message)
}

reg := prometheus.NewPedanticRegistry()
if err := reg.Register(server.v3.metricErrorCounter); err != nil {
panic(fmt.Errorf("registering collector failed: %w", err))
}

g := prometheus.ToTransactionalGatherer(reg)
fam, _, err := g.Gather()
if err != nil {
panic(err)
}
if len(fam) != 1 {
t.Fatalf("Expected 1 metric, got %d", len(fam))
}
if fam[0].Metric[0].Counter.GetValue() != 1 {
t.Fatalf("Expected counter value 1, got %v", fam[0].Metric[0].Counter.GetValue())
}
if len(fam[0].Metric[0].GetLabel()) != 1 {
t.Fatalf("Expected 1 counter metric label, got %v", fam[0].Metric[0].GetLabel())
}
if fam[0].Metric[0].GetLabel()[0].GetValue() != "log_unknown_error" {
t.Fatalf("Expected error metric with reason label log_unknown_error, got %v", fam[0].Metric[0].GetLabel()[0].GetValue())
}
}

func TestCheckBadDecisionWithLoggerV2(t *testing.T) {
Expand All @@ -970,7 +1131,7 @@ func TestCheckBadDecisionWithLoggerV2(t *testing.T) {
// create custom logger
customLogger := &testPlugin{}

server := envoyExtAuthzV2Wrapper{testAuthzServer(nil, withCustomLogger(customLogger))}
server := envoyExtAuthzV2Wrapper{testAuthzServer(&Config{EnablePerformanceMetrics: true}, withCustomLogger(customLogger))}
ctx := context.Background()
output, err := server.Check(ctx, &req)

Expand All @@ -992,6 +1153,29 @@ func TestCheckBadDecisionWithLoggerV2(t *testing.T) {
event.DecisionID == "" || event.Metrics == nil {
t.Fatalf("Unexpected events: %+v", customLogger.events)
}

reg := prometheus.NewPedanticRegistry()
if err := reg.Register(server.v3.metricErrorCounter); err != nil {
panic(fmt.Errorf("registering collector failed: %w", err))
}

g := prometheus.ToTransactionalGatherer(reg)
fam, _, err := g.Gather()
if err != nil {
panic(err)
}
if len(fam) != 1 {
t.Fatalf("Expected 1 metric, got %d", len(fam))
}
if fam[0].Metric[0].Counter.GetValue() != 1 {
t.Fatalf("Expected counter value 1, got %v", fam[0].Metric[0].Counter.GetValue())
}
if len(fam[0].Metric[0].GetLabel()) != 1 {
t.Fatalf("Expected 1 counter metric label, got %v", fam[0].Metric[0].GetLabel())
}
if fam[0].Metric[0].GetLabel()[0].GetValue() != "eval_unknown_error" {
t.Fatalf("Expected error metric with reason label eval_unknown_error, got %v", fam[0].Metric[0].GetLabel()[0].GetValue())
}
}

func TestCheckDenyWithLoggerV2(t *testing.T) {
Expand Down

0 comments on commit 3dc3bac

Please sign in to comment.