Skip to content

Commit 09281c6

Browse files
committed
etcdserver: fix incorrect metrics generated when clients cancel watches
Before this patch, a client which cancels the context for a watch results in the server generating a `rpctypes.ErrGRPCNoLeader` error that leads the recording of a gRPC `Unavailable` metric in association with the client watch cancellation. The metric looks like this: grpc_server_handled_total{grpc_code="Unavailable",grpc_method="Watch",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"} So, the watch server has misidentified the error as a server error and then propagates the mistake to metrics, leading to a false indicator that the leader has been lost. This false signal then leads to false alerting. This patch improves the behavior by: 1. Performing a deeper analysis during stream closure to more conclusively determine that a leader has actually been lost before propagating a ErrGRPCNoLeader error. 2. Returning a ErrGRPCWatchCanceled error if no conclusion can be drawn regarding leader loss. There remains an assumption that absence of evidence of leader loss means a client cancelled, but in practice this seems less likely to break down whereas client cancellations are frequent and expected. This is a continuation of the work already done in etcd-io#11375. Fixes etcd-io#10289, etcd-io#9725, etcd-io#9576, etcd-io#9166
1 parent 1af6d61 commit 09281c6

File tree

3 files changed

+22
-3
lines changed

3 files changed

+22
-3
lines changed

etcdserver/api/v3rpc/rpctypes/error.go

+2
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,8 @@ var (
3535
ErrGRPCLeaseExist = status.New(codes.FailedPrecondition, "etcdserver: lease already exists").Err()
3636
ErrGRPCLeaseTTLTooLarge = status.New(codes.OutOfRange, "etcdserver: too large lease TTL").Err()
3737

38+
ErrGRPCWatchCanceled = status.New(codes.Canceled, "etcdserver: watch canceled").Err()
39+
3840
ErrGRPCMemberExist = status.New(codes.FailedPrecondition, "etcdserver: member ID already exist").Err()
3941
ErrGRPCPeerURLExist = status.New(codes.FailedPrecondition, "etcdserver: Peer URLs already exists").Err()
4042
ErrGRPCMemberNotEnoughStarted = status.New(codes.FailedPrecondition, "etcdserver: re-configuration failed due to not enough started members").Err()

etcdserver/api/v3rpc/watch.go

+16-2
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,16 @@ import (
2121
"sync"
2222
"time"
2323

24+
"google.golang.org/grpc/metadata"
25+
2426
"go.etcd.io/etcd/v3/auth"
2527
"go.etcd.io/etcd/v3/etcdserver"
2628
"go.etcd.io/etcd/v3/etcdserver/api/v3rpc/rpctypes"
2729
pb "go.etcd.io/etcd/v3/etcdserver/etcdserverpb"
2830
"go.etcd.io/etcd/v3/mvcc"
2931
"go.etcd.io/etcd/v3/mvcc/mvccpb"
32+
"go.etcd.io/etcd/v3/pkg/types"
33+
"go.etcd.io/etcd/v3/raft"
3034

3135
"go.uber.org/zap"
3236
)
@@ -191,9 +195,19 @@ func (ws *watchServer) Watch(stream pb.Watch_WatchServer) (err error) {
191195

192196
case <-stream.Context().Done():
193197
err = stream.Context().Err()
194-
// the only server-side cancellation is noleader for now.
195198
if err == context.Canceled {
196-
err = rpctypes.ErrGRPCNoLeader
199+
// Try to determine a more specific cancellation error. Use the stream context
200+
// and local leader state to detect leader loss. If the reason is inconclusive,
201+
// assume a client cancellation.
202+
if md, hasMetadata := metadata.FromIncomingContext(stream.Context()); hasMetadata {
203+
if rl := md[rpctypes.MetadataRequireLeaderKey]; len(rl) > 0 && rl[0] == rpctypes.MetadataHasLeader {
204+
if sws.sg.Leader() == types.ID(raft.None) {
205+
err = rpctypes.ErrGRPCNoLeader
206+
}
207+
}
208+
} else {
209+
err = rpctypes.ErrGRPCWatchCanceled
210+
}
197211
}
198212
}
199213

tests/e2e/metrics_test.go

+4-1
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@ func metricsTest(cx ctlCtx) {
4949
{"/metrics", fmt.Sprintf("etcd_mvcc_delete_total 3")},
5050
{"/metrics", fmt.Sprintf(`etcd_server_version{server_version="%s"} 1`, version.Version)},
5151
{"/metrics", fmt.Sprintf(`etcd_cluster_version{cluster_version="%s"} 1`, version.Cluster(version.Version))},
52+
{"/metrics", fmt.Sprintf(`grpc_server_handled_total{grpc_code="Canceled",grpc_method="Watch",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"} 6`)},
5253
{"/health", `{"health":"true","reason":""}`},
5354
} {
5455
i++
@@ -58,7 +59,9 @@ func metricsTest(cx ctlCtx) {
5859
if err := ctlV3Del(cx, []string{fmt.Sprintf("%d", i)}, 1); err != nil {
5960
cx.t.Fatal(err)
6061
}
61-
62+
if err := ctlV3Watch(cx, []string{"k", "--rev", "1"}, []kvExec{{key: "k", val: "v"}}...); err != nil {
63+
cx.t.Fatal(err)
64+
}
6265
if err := cURLGet(cx.epc, cURLReq{endpoint: test.endpoint, expected: test.expected, metricsURLScheme: cx.cfg.metricsURLScheme}); err != nil {
6366
cx.t.Fatalf("failed get with curl (%v)", err)
6467
}

0 commit comments

Comments
 (0)