Commit c7c08fb

txnal producer: work around KAFKA-12671, take 2
Rather than ever aborting anything when doing so could be unsafe, we now rely on the logic we previously added to ensure we never break sequence numbers: we abort only when it is safe to do so. Second, we now wait for everything to be "flushed" (with the caveat that records can now abort while flushing). These two changes in combination should avoid KAFKA-12671.
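To illustrate the first half of that ("we abort only when it is safe"), here is a hedged, standalone sketch rather than kgo's implementation: the names (producerSketch, batchSketch, markAborting, maybeFail) are made up, and the written-vs-unwritten gating is an assumption about when such a check is consulted, but the shape mirrors the new isAborting branch added to maybeFailErr in pkg/kgo/sink.go below.

// Hypothetical sketch, not kgo's API: an atomic "aborting" flag gates
// whether a still-unwritten batch may fail itself.
package main

import (
	"errors"
	"fmt"
	"sync/atomic"
)

var errAborting = errors.New("records were aborted before being produced")

type producerSketch struct {
	aborting uint32 // 1 means yes, mirroring producer.aborting in the diff below
}

func (p *producerSketch) markAborting(on bool) {
	var v uint32
	if on {
		v = 1
	}
	atomic.StoreUint32(&p.aborting, v)
}

func (p *producerSketch) isAborting() bool { return atomic.LoadUint32(&p.aborting) != 0 }

type batchSketch struct {
	owner   *producerSketch
	written bool // true once the batch is on the wire with a sequence number
}

// maybeFail reports whether the batch can be failed right now. A batch that
// has already been written must not be failed, otherwise the sequence numbers
// the broker has seen would no longer line up with what we re-send.
func (b *batchSketch) maybeFail() error {
	if b.written {
		return nil // too late to abort safely; wait for it instead
	}
	if b.owner.isAborting() {
		return errAborting
	}
	return nil
}

func main() {
	p := &producerSketch{}
	unwritten := &batchSketch{owner: p}
	inflight := &batchSketch{owner: p, written: true}

	p.markAborting(true)
	fmt.Println(unwritten.maybeFail()) // records were aborted before being produced
	fmt.Println(inflight.maybeFail())  // <nil>: already written, so we wait for it
}

The point is that an abort never yanks a batch whose sequence numbers the broker may already have seen; those are simply waited on.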
1 parent c4481a5 commit c7c08fb

3 files changed: +29 -83 lines changed
Diff for: pkg/kgo/producer.go (-4)

@@ -34,7 +34,6 @@ type producer struct {
 	flushing int32 // >0 if flushing, can Flush many times concurrently
 
 	aborting uint32 // 1 means yes
-	workers  int32  // number of sinks draining / number of in flight produce requests
 
 	idMu      sync.Mutex
 	idVersion int16
@@ -66,9 +65,6 @@ func (p *producer) init() {
 	p.notifyCond = sync.NewCond(&p.notifyMu)
 }
 
-func (p *producer) incWorkers() { atomic.AddInt32(&p.workers, 1) }
-func (p *producer) decWorkers() { p.decAbortNotify(&p.workers) }
-
 func (p *producer) decAbortNotify(v *int32) {
 	if atomic.AddInt32(v, -1) != 0 || atomic.LoadUint32(&p.aborting) == 0 {
 		return

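The hunk above ends before decAbortNotify's body, so the following is only an assumption about its shape: decrement an atomic count and, if it just hit zero while an abort is in progress, wake whoever is waiting on the producer's condition variable. All names below (tracker, dec) are illustrative, not kgo's.

// Hypothetical decrement-and-notify sketch; treat it as a guess at the
// shape of decAbortNotify, whose full body is not shown in the hunk above.
package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

type tracker struct {
	count    int32
	aborting uint32
	mu       sync.Mutex
	cond     *sync.Cond
}

// dec decrements the count and, only if it hit zero while aborting, wakes
// whoever is waiting for the drain to finish.
func (t *tracker) dec() {
	if atomic.AddInt32(&t.count, -1) != 0 || atomic.LoadUint32(&t.aborting) == 0 {
		return
	}
	// Lock and unlock around the broadcast so a waiter that has already
	// checked the count but not yet parked on the cond cannot miss the wakeup.
	t.mu.Lock()
	t.mu.Unlock()
	t.cond.Broadcast()
}

func main() {
	t := &tracker{count: 2, aborting: 1}
	t.cond = sync.NewCond(&t.mu)
	t.dec() // count is now 1: no broadcast
	t.dec() // count hit 0 while aborting: broadcast fires
	fmt.Println(atomic.LoadInt32(&t.count))
}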
Diff for: pkg/kgo/sink.go (+5 -30)

@@ -38,13 +38,6 @@ type sink struct {
 	needBackoff bool
 	backoffSeq  uint32 // prevents pile on failures
 
-	// To work around KAFKA-12671, before we issue EndTxn, we check to see
-	// that all sinks had a final successful response. If not, then we risk
-	// running into KAFKA-12671 (out of order processing leading to
-	// orphaned begun "transaction" in ProducerStateManager), so rather
-	// than issuing EndTxn immediately, we wait a little bit.
-	lastRespSuccessful bool
-
 	// consecutiveFailures is incremented every backoff and cleared every
 	// successful response. For simplicity, if we have a good response
 	// following an error response before the error response's backoff
@@ -65,10 +58,9 @@ type seqResp struct {
 
 func (cl *Client) newSink(nodeID int32) *sink {
 	s := &sink{
-		cl:                 cl,
-		nodeID:             nodeID,
-		produceVersion:     -1,
-		lastRespSuccessful: true,
+		cl:             cl,
+		nodeID:         nodeID,
+		produceVersion: -1,
 	}
 	s.inflightSem.Store(make(chan struct{}, 1))
 	return s
@@ -231,16 +223,8 @@ func (s *sink) drain() {
 		time.Sleep(5 * time.Millisecond)
 	}
 
-	s.cl.producer.incWorkers()
-	defer s.cl.producer.decWorkers()
-
 	again := true
 	for again {
-		if s.cl.producer.isAborting() {
-			s.drainState.hardFinish()
-			return
-		}
-
 		s.maybeBackoff()
 
 		sem := s.inflightSem.Load().(chan struct{})
@@ -343,20 +327,9 @@ func (s *sink) produce(sem <-chan struct{}) bool {
 
 	req.backoffSeq = s.backoffSeq // safe to read outside mu since we are in drain loop
 
-	// Add that we are working and then check if we are aborting: this
-	// order ensures that we will do not produce after aborting is set.
-	p := &s.cl.producer
-	p.incWorkers()
-	if p.isAborting() {
-		p.decWorkers()
-		return false
-	}
-
	produced = true
 	s.doSequenced(req, func(resp kmsg.Response, err error) {
-		s.lastRespSuccessful = err == nil
 		s.handleReqResp(req, resp, err)
-		p.decWorkers()
 		<-sem
 	})
 	return moreToDrain
@@ -1236,6 +1209,8 @@ func (b *recBatch) maybeFailErr(cfg *cfg) error {
 		return errRecordTimeout
 	} else if b.tries >= cfg.produceRetries {
 		return errRecordRetries
+	} else if b.owner.cl.producer.isAborting() {
+		return ErrAborting
 	}
 	return nil
 }

Diff for: pkg/kgo/txn.go (+24 -49)

@@ -164,37 +164,9 @@ func (s *GroupTransactSession) End(ctx context.Context, commit TransactionEndTry
 			return false, err // we do not abort below, because an error here is ctx closing
 		}
 	case TryAbort:
-		// If we have no buffered records, there is no need to abort
-		// buffered records and we can avoid resetting our producer ID.
-		if atomic.LoadInt64(&s.cl.producer.bufferedRecords) == 0 {
-			break
-		}
-
 		if err := s.cl.AbortBufferedRecords(ctx); err != nil {
 			return false, err // same
 		}
-		defer s.cl.ResetProducerID()
-
-		allOk := true
-		s.cl.sinksAndSourcesMu.Lock()
-		for _, sns := range s.cl.sinksAndSources {
-			allOk = allOk && sns.sink.lastRespSuccessful
-		}
-		s.cl.sinksAndSourcesMu.Unlock()
-
-		if !allOk {
-			s.cl.cfg.logger.Log(LogLevelWarn, "Buffered records were aborted, but some sink(s) did not have a final handled produce response. Kafka could still be handling these produce requests or have yet to handle them. We do not want to issue EndTxn before these produce requests are handled, because that would risk beginning a new transaction that we may not finish. Waiting 1s to give Kafka some time... (See KAFKA-12671)")
-			timer := time.NewTimer(time.Second)
-			select {
-			case <-timer.C:
-			case <-s.cl.ctx.Done():
-				timer.Stop()
-				return false, s.cl.ctx.Err()
-			case <-ctx.Done():
-				timer.Stop()
-				return false, ctx.Err()
-			}
-		}
 	}
 
 	wantCommit := bool(commit)
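Aside, before the next hunk: this is the abort path a caller reaches through End with TryAbort. The wrapper below compiles against franz-go's public kgo package using only names visible in this diff (GroupTransactSession, End, TryAbort); the wrapper itself and its error handling are illustrative, not part of the library. The next hunk then reworks AbortBufferedRecords itself.

package txnexample

import (
	"context"

	"github.com/twmb/franz-go/pkg/kgo"
)

// endWithAbort is an illustrative wrapper, not part of kgo: it ends the
// session's current transaction without committing, which drives the
// TryAbort branch shown above.
func endWithAbort(ctx context.Context, sess *kgo.GroupTransactSession) error {
	committed, err := sess.End(ctx, kgo.TryAbort)
	if err != nil {
		// Per the comment in the hunk above, an error here means the
		// context closed, not that the abort itself failed.
		return err
	}
	_ = committed // expected to be false when requesting an abort
	return nil
}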
@@ -326,48 +298,51 @@ func (cl *Client) BeginTransaction() error {
 // Records produced during or after a call to this function may not be failed,
 // thus it is incorrect to concurrently produce with this function.
 func (cl *Client) AbortBufferedRecords(ctx context.Context) error {
-	atomic.StoreUint32(&cl.producer.aborting, 1)
-	defer atomic.StoreUint32(&cl.producer.aborting, 0)
-	atomic.AddInt32(&cl.producer.flushing, 1) // disallow lingering to start
-	defer atomic.AddInt32(&cl.producer.flushing, -1)
+	p := &cl.producer
+
+	atomic.StoreUint32(&p.aborting, 1)
+	defer atomic.StoreUint32(&p.aborting, 0)
+	atomic.AddInt32(&p.flushing, 1) // disallow lingering to start
+	defer atomic.AddInt32(&p.flushing, -1)
 	// At this point, all drain loops that start will immediately stop,
 	// thus they will not begin any AddPartitionsToTxn request. We must
 	// now wait for any req currently built to be done being issued.
 
 	cl.cfg.logger.Log(LogLevelInfo, "aborting buffered records")
 	defer cl.cfg.logger.Log(LogLevelDebug, "aborted buffered records")
 
-	cl.failBufferedRecords(ErrAborting)
+	// Similar to flushing, we unlinger; nothing will start a linger because
+	// the flushing atomic is non-zero.
+	if cl.cfg.linger > 0 || cl.cfg.manualFlushing {
+		for _, parts := range p.topics.load() {
+			for _, part := range parts.load().partitions {
+				part.records.unlingerAndManuallyDrain()
+			}
+		}
+	}
 
-	// Now, we wait for any active drain to stop. We must wait for all
-	// workers to stop otherwise we could end up with some exceptionally
-	// weird scenario where we end a txn and begin a new one before a
-	// prior AddPartitionsToTxn request that was built is issued.
-	//
-	// By waiting for our workers count to hit 0, we know that at that
-	// point, no new AddPartitionsToTxn request will be sent.
+	// We have to wait for all buffered records to either be flushed
+	// or to safely abort themselves.
 	quit := false
 	done := make(chan struct{})
 	go func() {
-		cl.producer.notifyMu.Lock()
-		defer cl.producer.notifyMu.Unlock()
+		p.notifyMu.Lock()
+		defer p.notifyMu.Unlock()
 		defer close(done)
 
-		for !quit && atomic.LoadInt32(&cl.producer.workers) > 0 {
-			cl.producer.notifyCond.Wait()
+		for !quit && atomic.LoadInt64(&p.bufferedRecords) > 0 {
+			p.notifyCond.Wait()
 		}
 	}()
 
 	select {
 	case <-done:
-		// All records were failed above, and all workers are stopped.
-		// We are safe to return.
 		return nil
 	case <-ctx.Done():
-		cl.producer.notifyMu.Lock()
+		p.notifyMu.Lock()
 		quit = true
-		cl.producer.notifyMu.Unlock()
-		cl.producer.notifyCond.Broadcast()
+		p.notifyMu.Unlock()
+		p.notifyCond.Broadcast()
 		return ctx.Err()
 	}
 }

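The waiting scheme at the core of the new AbortBufferedRecords is a reusable Go pattern: park on a sync.Cond until an atomic counter drains to zero, but let a context cancel the wait. Below is a standalone, hedged re-creation of that shape; the waiter type and its method names are made up for the example and are not kgo's API.

// Standalone sketch of the wait-for-drain-or-ctx pattern used in
// AbortBufferedRecords above; all names here are illustrative.
package main

import (
	"context"
	"fmt"
	"sync"
	"sync/atomic"
	"time"
)

type waiter struct {
	mu       sync.Mutex
	cond     *sync.Cond
	buffered int64
}

func newWaiter(n int64) *waiter {
	w := &waiter{buffered: n}
	w.cond = sync.NewCond(&w.mu)
	return w
}

// done marks one buffered item finished and wakes the drainer if the count
// just hit zero. The lock/unlock handshake prevents a lost wakeup.
func (w *waiter) done() {
	if atomic.AddInt64(&w.buffered, -1) != 0 {
		return
	}
	w.mu.Lock()
	w.mu.Unlock()
	w.cond.Broadcast()
}

// waitDrained blocks until the count reaches zero or ctx is canceled,
// mirroring the goroutine-plus-select shape in the hunk above.
func (w *waiter) waitDrained(ctx context.Context) error {
	quit := false
	done := make(chan struct{})
	go func() {
		w.mu.Lock()
		defer w.mu.Unlock()
		defer close(done)
		for !quit && atomic.LoadInt64(&w.buffered) > 0 {
			w.cond.Wait()
		}
	}()

	select {
	case <-done:
		return nil
	case <-ctx.Done():
		w.mu.Lock()
		quit = true
		w.mu.Unlock()
		w.cond.Broadcast()
		return ctx.Err()
	}
}

func main() {
	w := newWaiter(3)
	for i := 0; i < 3; i++ {
		go func() {
			time.Sleep(10 * time.Millisecond)
			w.done()
		}()
	}

	ctx, cancel := context.WithTimeout(context.Background(), time.Second)
	defer cancel()
	fmt.Println(w.waitDrained(ctx)) // <nil> once everything has drained
}

Running it prints <nil> once the three simulated records drain; canceling the context instead returns ctx.Err(), exactly as AbortBufferedRecords does.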