Skip to content

Commit 50e26d5

Browse files
authored
[close #395] Batch writes in Kafka consumer (#396)
* collect pprof heap Signed-off-by: Ping Yu <[email protected]> * unlimit retry for pd connection Signed-off-by: Ping Yu <[email protected]> * reduce record size Signed-off-by: Ping Yu <[email protected]> * log level: info Signed-off-by: Ping Yu <[email protected]> * reduce data size; add grafana panel Signed-off-by: Ping Yu <[email protected]> * batch Signed-off-by: Ping Yu <[email protected]> * fix Signed-off-by: Ping Yu <[email protected]> * try debug Signed-off-by: Ping Yu <[email protected]> * fix encoder size Signed-off-by: Ping Yu <[email protected]> * fix Signed-off-by: Ping Yu <[email protected]> * MQMessage pool Signed-off-by: Ping Yu <[email protected]> * fix release Signed-off-by: Ping Yu <[email protected]> * wip Signed-off-by: Ping Yu <[email protected]> * fix flaky ut Signed-off-by: Ping Yu <[email protected]> * logging Signed-off-by: Ping Yu <[email protected]> * fix ut Signed-off-by: Ping Yu <[email protected]> * wip Signed-off-by: Ping Yu <[email protected]> * adjust memory release parameter Signed-off-by: Ping Yu <[email protected]> * polish Signed-off-by: Ping Yu <[email protected]> * polish Signed-off-by: Ping Yu <[email protected]> * polish Signed-off-by: Ping Yu <[email protected]> * polish Signed-off-by: Ping Yu <[email protected]> * fix ut Signed-off-by: Ping Yu <[email protected]> * polish Signed-off-by: Ping Yu <[email protected]> --------- Signed-off-by: Ping Yu <[email protected]>
1 parent 40357a1 commit 50e26d5

File tree

8 files changed

+113
-58
lines changed

8 files changed

+113
-58
lines changed

cdc/cdc/sink/statistics.go

+1-1
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ import (
2525
)
2626

2727
const (
28-
printStatusInterval = 10 * time.Minute
28+
printStatusInterval = 30 * time.Second
2929
flushMetricsInterval = 5 * time.Second
3030
)
3131

cdc/cdc/sink/tikv.go

+22-12
Original file line numberDiff line numberDiff line change
@@ -257,7 +257,7 @@ type innerBatch struct {
257257
TTLs []uint64
258258
}
259259

260-
type tikvBatcher struct {
260+
type TikvBatcher struct {
261261
Batches []innerBatch
262262
count int
263263
byteSize uint64
@@ -266,22 +266,26 @@ type tikvBatcher struct {
266266
statistics *Statistics
267267
}
268268

269-
func newTiKVBatcher(statistics *Statistics) *tikvBatcher {
270-
b := &tikvBatcher{
269+
func NewTiKVBatcher(statistics *Statistics) *TikvBatcher {
270+
b := &TikvBatcher{
271271
statistics: statistics,
272272
}
273273
return b
274274
}
275275

276-
func (b *tikvBatcher) Count() int {
276+
func (b *TikvBatcher) Count() int {
277277
return b.count
278278
}
279279

280-
func (b *tikvBatcher) ByteSize() uint64 {
280+
func (b *TikvBatcher) IsEmpty() bool {
281+
return b.count == 0
282+
}
283+
284+
func (b *TikvBatcher) ByteSize() uint64 {
281285
return b.byteSize
282286
}
283287

284-
func (b *tikvBatcher) getNow() uint64 {
288+
func (b *TikvBatcher) getNow() uint64 {
285289
failpoint.Inject("tikvSinkGetNow", func(val failpoint.Value) {
286290
now := uint64(val.(int))
287291
failpoint.Return(now)
@@ -316,16 +320,18 @@ func ExtractRawKVEntry(entry *model.RawKVEntry, now uint64) (opType model.OpType
316320
return
317321
}
318322

319-
func (b *tikvBatcher) Append(entry *model.RawKVEntry) {
323+
func (b *TikvBatcher) Append(entry *model.RawKVEntry) error {
320324
if len(b.Batches) == 0 {
321325
b.now = b.getNow()
322326
}
323327

324328
opType, key, value, ttl, err := ExtractRawKVEntry(entry, b.now)
325329
if err != nil {
326330
log.Error("failed to extract entry", zap.Any("event", entry), zap.Error(err))
327-
b.statistics.AddInvalidKeyCount()
328-
return
331+
if b.statistics != nil {
332+
b.statistics.AddInvalidKeyCount()
333+
}
334+
return errors.Trace(err)
329335
}
330336

331337
// NOTE: do NOT separate PUT & DELETE operations into two batch.
@@ -353,9 +359,11 @@ func (b *tikvBatcher) Append(entry *model.RawKVEntry) {
353359
if opType == model.OpTypePut {
354360
b.byteSize += uint64(len(value)) + uint64(unsafe.Sizeof(ttl))
355361
}
362+
363+
return nil
356364
}
357365

358-
func (b *tikvBatcher) Reset() {
366+
func (b *TikvBatcher) Reset() {
359367
b.Batches = b.Batches[:0]
360368
b.count = 0
361369
b.byteSize = 0
@@ -370,7 +378,7 @@ func (k *tikvSink) runWorker(ctx context.Context, workerIdx uint32) error {
370378
tick := time.NewTicker(500 * time.Millisecond)
371379
defer tick.Stop()
372380

373-
batcher := newTiKVBatcher(k.statistics)
381+
batcher := NewTiKVBatcher(k.statistics)
374382

375383
flushToTiKV := func() error {
376384
return k.statistics.RecordBatchExecution(func() (int, error) {
@@ -426,7 +434,9 @@ func (k *tikvSink) runWorker(ctx context.Context, workerIdx uint32) error {
426434
}
427435
continue
428436
}
429-
batcher.Append(e.rawKVEntry)
437+
if err := batcher.Append(e.rawKVEntry); err != nil {
438+
return errors.Trace(err)
439+
}
430440

431441
if batcher.ByteSize() >= defaultTiKVBatchBytesLimit {
432442
if err := flushToTiKV(); err != nil {

cdc/cdc/sink/tikv_test.go

+3-3
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ func TestTiKVSinkBatcher(t *testing.T) {
154154
}()
155155

156156
statistics := NewStatistics(context.Background(), "TiKV", map[string]string{})
157-
batcher := newTiKVBatcher(statistics)
157+
batcher := NewTiKVBatcher(statistics)
158158
keys := []string{
159159
"a", "b", "c", "d", "e", "f",
160160
}
@@ -183,8 +183,8 @@ func TestTiKVSinkBatcher(t *testing.T) {
183183
ExpiredTs: expires[i],
184184
CRTs: uint64(i),
185185
}
186-
batcher.Append(entry0)
187-
batcher.Append(entry1)
186+
require.NoError(batcher.Append(entry0))
187+
require.Error(batcher.Append(entry1))
188188
}
189189
require.Len(batcher.Batches, 3)
190190
require.Equal(6, batcher.Count())

cdc/cmd/kafka-consumer/main.go

+48-22
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ import (
1818
"flag"
1919
"fmt"
2020
"math"
21+
"math/rand"
2122
"net/url"
2223
"os"
2324
"os/signal"
@@ -44,7 +45,7 @@ import (
4445
)
4546

4647
const (
47-
downstreamRetryInterval = 500 * time.Millisecond
48+
downstreamRetryIntervalMs int = 200
4849
)
4950

5051
// Sarama configuration options
@@ -379,15 +380,45 @@ func (c *Consumer) ConsumeClaim(session sarama.ConsumerGroupSession, claim saram
379380
if sink == nil {
380381
panic("sink should initialized")
381382
}
382-
ClaimMessages:
383+
kvs := make([]*model.RawKVEntry, 0)
383384
for message := range claim.Messages() {
384385
log.Debug("Message claimed", zap.Int32("partition", message.Partition), zap.ByteString("key", message.Key), zap.ByteString("value", message.Value))
385386
batchDecoder, err := codec.NewJSONEventBatchDecoder(message.Key, message.Value)
386387
if err != nil {
387388
return errors.Trace(err)
388389
}
389390

391+
// Return error only when the session is closed
392+
emitChangedEvents := func() error {
393+
if len(kvs) == 0 {
394+
return nil
395+
}
396+
for {
397+
err = sink.EmitChangedEvents(ctx, kvs...)
398+
if err == nil {
399+
log.Debug("emit changed events", zap.Any("kvs", kvs))
400+
lastCRTs := sink.lastCRTs.Load()
401+
lastKv := kvs[len(kvs)-1]
402+
if lastCRTs < lastKv.CRTs {
403+
sink.lastCRTs.Store(lastKv.CRTs)
404+
}
405+
kvs = kvs[:0]
406+
return nil
407+
}
408+
409+
log.Warn("emit row changed event failed", zap.Error(err))
410+
if session.Context().Err() != nil {
411+
log.Warn("session closed", zap.Error(session.Context().Err()))
412+
return session.Context().Err()
413+
}
414+
415+
sleepMs := downstreamRetryIntervalMs + rand.Intn(downstreamRetryIntervalMs)
416+
time.Sleep(time.Duration(sleepMs) * time.Millisecond)
417+
}
418+
}
419+
390420
counter := 0
421+
KvLoop:
391422
for {
392423
tp, hasNext, err := batchDecoder.HasNext()
393424
if err != nil {
@@ -416,32 +447,21 @@ ClaimMessages:
416447
zap.Uint64("globalResolvedTs", globalResolvedTs),
417448
zap.Uint64("sinkResolvedTs", sink.resolvedTs.Load()),
418449
zap.Int32("partition", partition))
419-
break ClaimMessages
450+
continue KvLoop
420451
}
421452

422-
for {
423-
err = sink.EmitChangedEvents(ctx, kv)
424-
if err == nil {
425-
log.Debug("emit changed events", zap.Any("kv", kv))
426-
lastCRTs := sink.lastCRTs.Load()
427-
if lastCRTs < kv.CRTs {
428-
sink.lastCRTs.Store(kv.CRTs)
429-
}
430-
break
431-
}
432-
433-
log.Warn("emit row changed event failed", zap.Error(err))
434-
if session.Context().Err() != nil {
435-
log.Warn("session closed", zap.Error(session.Context().Err()))
436-
return nil
437-
}
438-
time.Sleep(downstreamRetryInterval)
439-
}
453+
kvs = append(kvs, kv)
440454
case model.MqMessageTypeResolved:
441455
ts, err := batchDecoder.NextResolvedEvent()
442456
if err != nil {
443457
log.Fatal("decode message value failed", zap.ByteString("value", message.Value))
444458
}
459+
460+
if err := emitChangedEvents(); err != nil {
461+
log.Info("session closed", zap.Error(err))
462+
return nil
463+
}
464+
445465
resolvedTs := sink.resolvedTs.Load()
446466
if resolvedTs < ts {
447467
log.Debug("update sink resolved ts",
@@ -450,13 +470,19 @@ ClaimMessages:
450470
sink.resolvedTs.Store(ts)
451471
}
452472
}
453-
session.MarkMessage(message, "")
454473
}
455474

456475
if counter > kafkaMaxBatchSize {
457476
log.Fatal("Open Protocol max-batch-size exceeded", zap.Int("max-batch-size", kafkaMaxBatchSize),
458477
zap.Int("actual-batch-size", counter))
459478
}
479+
480+
if err := emitChangedEvents(); err != nil {
481+
log.Info("session closed", zap.Error(err))
482+
return nil
483+
}
484+
485+
session.MarkMessage(message, "")
460486
}
461487

462488
return nil

cdc/cmd/kafka-consumer/tikv.go

+35-17
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,6 @@ import (
1717
"context"
1818
"math"
1919
"net/url"
20-
"time"
2120

2221
"github.com/pingcap/errors"
2322
"github.com/pingcap/kvproto/pkg/kvrpcpb"
@@ -30,15 +29,17 @@ import (
3029
)
3130

3231
const (
33-
defaultPDErrorRetry int = math.MaxInt
32+
defaultPDErrorRetry int = math.MaxInt
33+
defaultTiKVBatchBytesLimit uint64 = 40 * 1024 * 1024 // 40MB
3434
)
3535

3636
var _ sink.Sink = (*tikvSimpleSink)(nil)
3737

3838
// tikvSimpleSink is a sink that sends events to downstream TiKV cluster.
3939
// The reason why we need this sink other than `cdc/sink/tikv.tikvSink` is that we need Kafka message offset to handle TiKV errors, which is not provided by `tikvSink`.
4040
type tikvSimpleSink struct {
41-
client *rawkv.Client
41+
client *rawkv.Client
42+
batcher *sink.TikvBatcher
4243
}
4344

4445
func newSimpleTiKVSink(ctx context.Context, sinkURI *url.URL, _ *config.ReplicaConfig, opts map[string]string, _ chan error) (*tikvSimpleSink, error) {
@@ -56,34 +57,50 @@ func newSimpleTiKVSink(ctx context.Context, sinkURI *url.URL, _ *config.ReplicaC
5657
return nil, errors.Trace(err)
5758
}
5859
return &tikvSimpleSink{
59-
client: client,
60+
client: client,
61+
batcher: sink.NewTiKVBatcher(nil),
6062
}, nil
6163
}
6264

6365
func (s *tikvSimpleSink) EmitChangedEvents(ctx context.Context, rawKVEntries ...*model.RawKVEntry) error {
64-
now := uint64(time.Now().Unix())
66+
s.batcher.Reset()
6567

66-
for _, entry := range rawKVEntries {
67-
opType, key, value, ttl, err := sink.ExtractRawKVEntry(entry, now)
68-
if err != nil {
69-
return errors.Trace(err)
68+
flushToTiKV := func() error {
69+
if s.batcher.IsEmpty() {
70+
return nil
7071
}
7172

72-
if opType == model.OpTypePut {
73-
err := s.client.PutWithTTL(ctx, key, value, ttl)
73+
var err error
74+
for _, batch := range s.batcher.Batches {
75+
if batch.OpType == model.OpTypePut {
76+
err = s.client.BatchPutWithTTL(ctx, batch.Keys, batch.Values, batch.TTLs)
77+
} else if batch.OpType == model.OpTypeDelete {
78+
err = s.client.BatchDelete(ctx, batch.Keys)
79+
} else {
80+
err = errors.Errorf("unexpected OpType: %v", batch.OpType)
81+
}
7482
if err != nil {
7583
return errors.Trace(err)
7684
}
77-
} else if opType == model.OpTypeDelete {
78-
err := s.client.Delete(ctx, key)
79-
if err != nil {
85+
}
86+
s.batcher.Reset()
87+
return nil
88+
}
89+
90+
for _, entry := range rawKVEntries {
91+
err := s.batcher.Append(entry)
92+
if err != nil {
93+
return errors.Trace(err)
94+
}
95+
96+
if s.batcher.ByteSize() >= defaultTiKVBatchBytesLimit {
97+
if err := flushToTiKV(); err != nil {
8098
return errors.Trace(err)
8199
}
82-
} else {
83-
return errors.Errorf("unexpected opType %v", opType)
84100
}
85101
}
86-
return nil
102+
103+
return errors.Trace(flushToTiKV())
87104
}
88105

89106
func (s *tikvSimpleSink) FlushChangedEvents(ctx context.Context, _ model.KeySpanID, resolvedTs uint64) (uint64, error) {
@@ -95,6 +112,7 @@ func (s *tikvSimpleSink) EmitCheckpointTs(ctx context.Context, ts uint64) error
95112
}
96113

97114
func (s *tikvSimpleSink) Close(ctx context.Context) error {
115+
s.batcher.Reset()
98116
return errors.Trace(s.client.Close())
99117
}
100118

cdc/deployments/tikv-cdc/docker-compose/docker-compose-kafka-integration.yml

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
version: '2.1'
1+
version: "3"
22

33
services:
44
zookeeper:

cdc/tests/integration_tests/_utils/test_prepare

+2-1
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,7 @@ function on_exit() {
8080
return 0
8181
else
8282
echo "Error $STATUS_CODE occurred on $LINE for sink $SINK_TYPE"
83-
tail -n +1 "$WORK_DIR"/cdc*.log
83+
# CI env already collect "*.log". Uncomment it for other envs.
84+
# tail -n +1 "$WORK_DIR"/cdc*.log
8485
fi
8586
}

cdc/tests/integration_tests/changefeed_pause_resume/run.sh

+1-1
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ function run() {
2828
run_kafka_consumer --workdir "$WORK_DIR" --upstream-uri "$SINK_URI"
2929
fi
3030

31-
for i in $(seq 1 10); do
31+
for _ in $(seq 1 10); do
3232
tikv-cdc cli changefeed pause --changefeed-id=$changefeed_id --pd=$UP_PD
3333
rawkv_op $UP_PD put 5000
3434
tikv-cdc cli changefeed resume --changefeed-id=$changefeed_id --pd=$UP_PD

0 commit comments

Comments
 (0)