Skip to content

Commit 6318b15

Browse files
committed
group consumer: revoke all on LeaveGroup, properly blocking commit
Leaving the group cancels the group context. This context is used in defaultRevoke, meaning that leaving a group prevents an offset commit in onRevoke to be successful. Now, we use the client context, which is only canceled after unsetting a consumer, which waits for the group to be left. Additionally, we now revoke all offsets if cooperative if we are leaving the group. Previously, we would rely on the manage logic to call onRevoke, but this also sometimes calls onLost. Technically with onLost, it is "too late" to commit at that point. We do not want to imply that if a user specifies their own OnLost. Finally, we default onLost to defaultRevoke. We implicitly did this before by falling into onRevoke if onLost was nil, but we may as well explicitly do this.
1 parent 887c134 commit 6318b15

File tree

1 file changed

+19
-10
lines changed

1 file changed

+19
-10
lines changed

pkg/kgo/consumer_group.go

+19-10
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,7 @@ func (cl *Client) AssignGroup(group string, opts ...GroupOpt) {
387387
}
388388
if c.cl.cfg.txnID == nil {
389389
g.onRevoked = g.defaultRevoke
390+
g.onLost = g.defaultRevoke
390391
} else {
391392
g.autocommitDisable = true
392393
}
@@ -605,15 +606,19 @@ const (
605606
//
606607
// Lastly, for cooperative consumers, this must selectively delete what was
607608
// lost from the uncommitted map.
608-
func (g *groupConsumer) revoke(stage revokeStage, lost map[string][]int32) {
609-
if !g.cooperative { // stage == revokeThisSession if not cooperative
609+
func (g *groupConsumer) revoke(stage revokeStage, lost map[string][]int32, leaving bool) {
610+
if !g.cooperative || leaving { // stage == revokeThisSession if not cooperative
610611
// If we are an eager consumer, we stop fetching all of our
611612
// current partitions as we will be revoking them.
612613
g.c.mu.Lock()
613614
g.c.assignPartitions(nil, assignInvalidateAll)
614615
g.c.mu.Unlock()
615616

616-
g.cl.cfg.logger.Log(LogLevelInfo, "eager consumer revoking prior assigned partitions", "revoking", g.nowAssigned)
617+
if !g.cooperative {
618+
g.cl.cfg.logger.Log(LogLevelInfo, "eager consumer revoking prior assigned partitions", "revoking", g.nowAssigned)
619+
} else {
620+
g.cl.cfg.logger.Log(LogLevelInfo, "cooperative consumer revoking prior assigned partitions because leaving group", "revoking", g.nowAssigned)
621+
}
617622
if g.onRevoked != nil {
618623
g.onRevoked(g.ctx, g.nowAssigned)
619624
}
@@ -736,7 +741,7 @@ func (s *assignRevokeSession) prerevoke(g *groupConsumer, lost map[string][]int3
736741
go func() {
737742
defer close(s.prerevokeDone)
738743
if g.cooperative && len(lost) > 0 {
739-
g.revoke(revokeLastSession, lost)
744+
g.revoke(revokeLastSession, lost, false)
740745
}
741746
}()
742747
return s.prerevokeDone
@@ -765,13 +770,11 @@ func (s *assignRevokeSession) assign(g *groupConsumer, newAssigned map[string][]
765770
// This may not run before returning from the heartbeat loop: if we encounter a
766771
// fatal error, we return before revoking so that we can instead call onLost in
767772
// the manage loop.
768-
func (s *assignRevokeSession) revoke(g *groupConsumer) <-chan struct{} {
773+
func (s *assignRevokeSession) revoke(g *groupConsumer, leaving bool) <-chan struct{} {
769774
go func() {
770775
defer close(s.revokeDone)
771776
<-s.assignDone
772-
if g.onRevoked != nil {
773-
g.revoke(revokeThisSession, nil)
774-
}
777+
g.revoke(revokeThisSession, nil, leaving)
775778
}()
776779
return s.revokeDone
777780
}
@@ -960,7 +963,10 @@ func (g *groupConsumer) heartbeat(fetchErrCh <-chan error, s *assignRevokeSessio
960963
// Now we call the user provided revoke callback, even
961964
// if cooperative: if cooperative, this only revokes
962965
// partitions we no longer want to consume.
963-
revoked = s.revoke(g)
966+
//
967+
// If the err is context.Canceled, the group is being
968+
// left and we revoke everything.
969+
revoked = s.revoke(g, err == context.Canceled)
964970
}
965971
// Since we errored, while waiting for the revoke to finish, we
966972
// update our metadata. A leader may have re-joined with new
@@ -1930,7 +1936,10 @@ func (cl *Client) CommitOffsets(
19301936
func (g *groupConsumer) defaultRevoke(_ context.Context, _ map[string][]int32) {
19311937
if !g.autocommitDisable {
19321938
un := g.getUncommitted()
1933-
g.cl.BlockingCommitOffsets(g.ctx, un, func(_ *kmsg.OffsetCommitRequest, resp *kmsg.OffsetCommitResponse, err error) {
1939+
// We use the client's context rather than the group context,
1940+
// because this could come from the group being left. The group
1941+
// context will already be canceled.
1942+
g.cl.BlockingCommitOffsets(g.cl.ctx, un, func(_ *kmsg.OffsetCommitRequest, resp *kmsg.OffsetCommitResponse, err error) {
19341943
if err != nil {
19351944
if err != ErrNotGroup && err != context.Canceled {
19361945
g.cl.cfg.logger.Log(LogLevelError, "default revoke BlockingCommitOffsets failed", "err", err)

0 commit comments

Comments
 (0)