Skip to content

Commit

Permalink
metrics: support usage inside CSI driver
Browse files Browse the repository at this point in the history
A CSI driver is a gRPC server, which implies that the interceptor must
use a slightly different API. The extended CSIMetricsManager gets
exported with a suitable method for that.

To avoid counting the same operation twice in the same metric, CSI
sidecar and driver should use different subsystem names. Two new
functions provide the CSIMetricsManager that they are expected to use
for the sake of consistency.

However, special cases need additional flexibility:
- constant labels (same for all samples)
- varying labels (same label names, but per-sample values)
- configurable subsystem
- configurable stability
  • Loading branch information
pohly committed Aug 21, 2020
1 parent 771facd commit 7c7e7b3
Show file tree
Hide file tree
Showing 4 changed files with 249 additions and 60 deletions.
23 changes: 19 additions & 4 deletions connection/connection.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ func connect(
grpc.WithBlock(), // Block until connection succeeds.
grpc.WithChainUnaryInterceptor(
LogGRPC, // Log all messages.
extendedCSIMetricsManager{metricsManager}.recordMetricsInterceptor, // Record metrics for each gRPC call.
ExtendedCSIMetricsManager{metricsManager}.RecordMetricsClientInterceptor, // Record metrics for each gRPC call.
),
)
unixPrefix := "unix://"
Expand Down Expand Up @@ -187,12 +187,13 @@ func LogGRPC(ctx context.Context, method string, req, reply interface{}, cc *grp
return err
}

type extendedCSIMetricsManager struct {
type ExtendedCSIMetricsManager struct {
metrics.CSIMetricsManager
}

// recordMetricsInterceptor is a gPRC unary interceptor for recording metrics for CSI operations.
func (cmm extendedCSIMetricsManager) recordMetricsInterceptor(
// RecordMetricsClientInterceptor is a gRPC unary interceptor for recording metrics for CSI operations
// in a gRPC client.
func (cmm ExtendedCSIMetricsManager) RecordMetricsClientInterceptor(
ctx context.Context,
method string,
req, reply interface{},
Expand All @@ -209,3 +210,17 @@ func (cmm extendedCSIMetricsManager) recordMetricsInterceptor(
)
return err
}

// RecordMetricsServerInterceptor is a gRPC unary server interceptor that
// records metrics for CSI operations served by a gRPC server.
//
// It times the wrapped handler and reports the full method name, the
// handler's error and the elapsed time through RecordMetrics. The handler's
// response and error are passed back to the caller unmodified.
func (cmm ExtendedCSIMetricsManager) RecordMetricsServerInterceptor(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (interface{}, error) {
	begin := time.Now()
	response, handlerErr := handler(ctx, req)
	elapsed := time.Since(begin)

	// Arguments, in order: operation name, operation error, operation duration.
	cmm.RecordMetrics(info.FullMethod, handlerErr, elapsed)

	return response, handlerErr
}
94 changes: 64 additions & 30 deletions connection/connection_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,34 @@ const (
serverSock = "server.sock"
)

// identityServer is a stub CSI identity service used by the tests. Every
// method fails with a gRPC Unimplemented status; it only exists so that a
// real service is registered on the test server.
type identityServer struct{}

// GetPluginInfo always returns a nil response and an Unimplemented status error.
func (s *identityServer) GetPluginInfo(ctx context.Context, req *csi.GetPluginInfoRequest) (*csi.GetPluginInfoResponse, error) {
	err := status.Error(codes.Unimplemented, "Unimplemented")
	return nil, err
}

// Probe always returns a nil response and an Unimplemented status error.
func (s *identityServer) Probe(ctx context.Context, req *csi.ProbeRequest) (*csi.ProbeResponse, error) {
	err := status.Error(codes.Unimplemented, "Unimplemented")
	return nil, err
}

// GetPluginCapabilities always returns a nil response and an Unimplemented status error.
func (s *identityServer) GetPluginCapabilities(ctx context.Context, req *csi.GetPluginCapabilitiesRequest) (*csi.GetPluginCapabilitiesResponse, error) {
	err := status.Error(codes.Unimplemented, "Unimplemented")
	return nil, err
}

// startServer creates a gRPC server. Services passed as non-nil are registered,
// and a non-nil cmm installs the server-side metrics interceptor.
// The returned address can be used to connect to it. The cleanup
// function stops it. It can be called multiple times.
func startServer(t *testing.T, tmp string, identity csi.IdentityServer, controller csi.ControllerServer) (string, func()) {
func startServer(t *testing.T, tmp string, identity csi.IdentityServer, controller csi.ControllerServer, cmm metrics.CSIMetricsManager) (string, func()) {
addr := path.Join(tmp, serverSock)
listener, err := net.Listen("unix", addr)
require.NoError(t, err, "listening on %s", addr)
server := grpc.NewServer()
var opts []grpc.ServerOption
if cmm != nil {
opts = append(opts,
grpc.UnaryInterceptor(ExtendedCSIMetricsManager{cmm}.RecordMetricsServerInterceptor),
)
}
server := grpc.NewServer(opts...)
if identity != nil {
csi.RegisterIdentityServer(server, identity)
}
Expand All @@ -85,7 +105,7 @@ func startServer(t *testing.T, tmp string, identity csi.IdentityServer, controll
func TestConnect(t *testing.T) {
tmp := tmpDir(t)
defer os.RemoveAll(tmp)
addr, stopServer := startServer(t, tmp, nil, nil)
addr, stopServer := startServer(t, tmp, nil, nil, nil)
defer stopServer()

conn, err := Connect(addr, metrics.NewCSIMetricsManager("fake.csi.driver.io"))
Expand All @@ -100,7 +120,7 @@ func TestConnect(t *testing.T) {
func TestConnectUnix(t *testing.T) {
tmp := tmpDir(t)
defer os.RemoveAll(tmp)
addr, stopServer := startServer(t, tmp, nil, nil)
addr, stopServer := startServer(t, tmp, nil, nil, nil)
defer stopServer()

conn, err := Connect("unix:///"+addr, metrics.NewCSIMetricsManager("fake.csi.driver.io"))
Expand Down Expand Up @@ -141,7 +161,7 @@ func TestWaitForServer(t *testing.T) {
t.Logf("sleeping %s before starting server", delay)
time.Sleep(delay)
startTimeServer = time.Now()
_, stopServer = startServer(t, tmp, nil, nil)
_, stopServer = startServer(t, tmp, nil, nil, nil)
}()
conn, err := Connect(path.Join(tmp, serverSock), metrics.NewCSIMetricsManager("fake.csi.driver.io"))
if assert.NoError(t, err, "connect via absolute path") {
Expand Down Expand Up @@ -175,7 +195,7 @@ func TestTimout(t *testing.T) {
func TestReconnect(t *testing.T) {
tmp := tmpDir(t)
defer os.RemoveAll(tmp)
addr, stopServer := startServer(t, tmp, nil, nil)
addr, stopServer := startServer(t, tmp, nil, nil, nil)
defer func() {
stopServer()
}()
Expand All @@ -202,7 +222,7 @@ func TestReconnect(t *testing.T) {
}

// No reconnection either when the server comes back.
_, stopServer = startServer(t, tmp, nil, nil)
_, stopServer = startServer(t, tmp, nil, nil, nil)
// We need to give gRPC some time. It does not attempt to reconnect
// immediately. If we send the method call too soon, the test passes
// even though a later method call will go through again.
Expand All @@ -220,7 +240,7 @@ func TestReconnect(t *testing.T) {
func TestDisconnect(t *testing.T) {
tmp := tmpDir(t)
defer os.RemoveAll(tmp)
addr, stopServer := startServer(t, tmp, nil, nil)
addr, stopServer := startServer(t, tmp, nil, nil, nil)
defer func() {
stopServer()
}()
Expand Down Expand Up @@ -251,7 +271,7 @@ func TestDisconnect(t *testing.T) {
}

// No reconnection either when the server comes back.
_, stopServer = startServer(t, tmp, nil, nil)
_, stopServer = startServer(t, tmp, nil, nil, nil)
// We need to give gRPC some time. It does not attempt to reconnect
// immediately. If we send the method call too soon, the test passes
// even though a later method call will go through again.
Expand All @@ -271,7 +291,7 @@ func TestDisconnect(t *testing.T) {
func TestExplicitReconnect(t *testing.T) {
tmp := tmpDir(t)
defer os.RemoveAll(tmp)
addr, stopServer := startServer(t, tmp, nil, nil)
addr, stopServer := startServer(t, tmp, nil, nil, nil)
defer func() {
stopServer()
}()
Expand Down Expand Up @@ -302,7 +322,7 @@ func TestExplicitReconnect(t *testing.T) {
}

// No reconnection either when the server comes back.
_, stopServer = startServer(t, tmp, nil, nil)
_, stopServer = startServer(t, tmp, nil, nil, nil)
// We need to give gRPC some time. It does not attempt to reconnect
// immediately. If we send the method call too soon, the test passes
// even though a later method call will go through again.
Expand All @@ -322,7 +342,10 @@ func TestExplicitReconnect(t *testing.T) {
func TestConnectMetrics(t *testing.T) {
tmp := tmpDir(t)
defer os.RemoveAll(tmp)
addr, stopServer := startServer(t, tmp, nil, nil)
cmmServer := metrics.NewCSIMetricsManagerForPlugin("fake.csi.driver.io")
// We have to have a real implementation of the gRPC call, otherwise the metrics
// interceptor is not called. The CSI identity service is used because it's simple.
addr, stopServer := startServer(t, tmp, &identityServer{}, nil, cmmServer)
defer stopServer()

cmm := metrics.NewCSIMetricsManager("fake.csi.driver.io")
Expand All @@ -332,38 +355,49 @@ func TestConnectMetrics(t *testing.T) {
defer conn.Close()
assert.Equal(t, connectivity.Ready, conn.GetState(), "connection ready")

if err := conn.Invoke(context.Background(), "/csi.v1.Controller/ControllerGetCapabilities", nil, nil); assert.Error(t, err) {
identityClient := csi.NewIdentityClient(conn)
if _, err := identityClient.GetPluginInfo(context.Background(), &csi.GetPluginInfoRequest{}); assert.Error(t, err) {
errStatus, _ := status.FromError(err)
assert.Equal(t, codes.Unimplemented, errStatus.Code(), "not implemented")
}
}

expectedMetrics := `# HELP csi_sidecar_operations_seconds [ALPHA] Container Storage Interface operation duration with gRPC error code status total
# TYPE csi_sidecar_operations_seconds histogram
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Controller/ControllerGetCapabilities",le="0.1"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Controller/ControllerGetCapabilities",le="0.25"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Controller/ControllerGetCapabilities",le="0.5"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Controller/ControllerGetCapabilities",le="1"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Controller/ControllerGetCapabilities",le="2.5"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Controller/ControllerGetCapabilities",le="5"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Controller/ControllerGetCapabilities",le="10"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Controller/ControllerGetCapabilities",le="15"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Controller/ControllerGetCapabilities",le="25"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Controller/ControllerGetCapabilities",le="50"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Controller/ControllerGetCapabilities",le="120"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Controller/ControllerGetCapabilities",le="300"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Controller/ControllerGetCapabilities",le="600"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Controller/ControllerGetCapabilities",le="+Inf"} 1
csi_sidecar_operations_seconds_sum{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Controller/ControllerGetCapabilities"} 0
csi_sidecar_operations_seconds_count{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Controller/ControllerGetCapabilities"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Identity/GetPluginInfo",le="0.1"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Identity/GetPluginInfo",le="0.25"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Identity/GetPluginInfo",le="0.5"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Identity/GetPluginInfo",le="1"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Identity/GetPluginInfo",le="2.5"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Identity/GetPluginInfo",le="5"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Identity/GetPluginInfo",le="10"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Identity/GetPluginInfo",le="15"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Identity/GetPluginInfo",le="25"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Identity/GetPluginInfo",le="50"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Identity/GetPluginInfo",le="120"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Identity/GetPluginInfo",le="300"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Identity/GetPluginInfo",le="600"} 1
csi_sidecar_operations_seconds_bucket{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Identity/GetPluginInfo",le="+Inf"} 1
csi_sidecar_operations_seconds_sum{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Identity/GetPluginInfo"} 0
csi_sidecar_operations_seconds_count{driver_name="fake.csi.driver.io",grpc_status_code="Unimplemented",method_name="/csi.v1.Identity/GetPluginInfo"} 1
`

if err := testutil.GatherAndCompare(
cmm.GetRegistry(), strings.NewReader(expectedMetrics)); err != nil {
// Ignore mismatches on csi_sidecar_operations_seconds_sum metric because execution time will vary from test to test.
err = verifyMetricsError(t, err, "csi_sidecar_operations_seconds_sum")
if err != nil {
t.Errorf("Expected metrics not found -- %v", err)
t.Errorf("Expected client metrics not found -- %v", err)
}
}

expectedMetrics = strings.Replace(expectedMetrics, "csi_sidecar", metrics.SubsystemPlugin, -1)
if err := testutil.GatherAndCompare(
cmmServer.GetRegistry(), strings.NewReader(expectedMetrics)); err != nil {
// Ignore mismatches on the plugin subsystem's operations_seconds_sum metric because execution time will vary from test to test.
err = verifyMetricsError(t, err, metrics.SubsystemPlugin+"_operations_seconds_sum")
if err != nil {
t.Errorf("Expected server metrics not found -- %v", err)
}
}
}
Expand Down
Loading

0 comments on commit 7c7e7b3

Please sign in to comment.