Commit 192bf30

committed: message tracker, better comments, use time.Now() everywhere (no point in dealing with millis)

1 parent 3754be9 · commit 192bf30

9 files changed: +317 −140

e2e/consumer.go (+51 −52)

@@ -11,7 +11,7 @@ import (
     "go.uber.org/zap"
 )
 
-func (s *Service) ConsumeFromManagementTopic(ctx context.Context) error {
+func (s *Service) startConsumeMessages(ctx context.Context) {
     client := s.client
     topicName := s.config.TopicManagement.Name
     topic := kgo.ConsumeTopics(kgo.NewOffset().AtEnd(), topicName)
@@ -33,81 +33,73 @@ func (s *Service) ConsumeFromManagementTopic(ctx context.Context) error {
     for {
         select {
         case <-ctx.Done():
-            return nil
+            return
         default:
             fetches := client.PollFetches(ctx)
+            receiveTimestamp := time.Now()
+
+            // Log all errors and continue afterwards as we might get errors and still have some fetch results
             errors := fetches.Errors()
             for _, err := range errors {
-                // Log all errors and continue afterwards as we might get errors and still have some fetch results
                 s.logger.Error("kafka fetch error",
                     zap.String("topic", err.Topic),
                     zap.Int32("partition", err.Partition),
                     zap.Error(err.Err))
             }
 
-            receiveTimestampMs := timeNowMs()
-
-            //
             // Process messages
-            iter := fetches.RecordIter()
-            var record *kgo.Record
-            for !iter.Done() {
-                record = iter.Next()
-
-                if record == nil {
-                    continue
+            fetches.EachRecord(func(record *kgo.Record) {
+                if record != nil {
+                    s.processMessage(record, receiveTimestamp)
                 }
+            })
+        }
+    }
+}
 
-                s.processMessage(record, receiveTimestampMs)
-            }
+func (s *Service) commitOffsets(ctx context.Context) {
+    client := s.client
 
-            //
-            // Commit offsets for processed messages
-            // todo: the normal way to commit offsets with franz-go is pretty good, but in our special case
-            // we want to do it manually, seperately for each partition, so we can track how long it took
-            if uncommittedOffset := client.UncommittedOffsets(); uncommittedOffset != nil {
+    //
+    // Commit offsets for processed messages
+    // todo: the normal way to commit offsets with franz-go is pretty good, but in our special case
+    // we want to do it manually, separately for each partition, so we can track how long it took
+    if uncommittedOffset := client.UncommittedOffsets(); uncommittedOffset != nil {
 
-                startCommitTimestamp := timeNowMs()
+        startCommitTimestamp := time.Now()
 
-                client.CommitOffsets(ctx, uncommittedOffset, func(req *kmsg.OffsetCommitRequest, r *kmsg.OffsetCommitResponse, err error) {
-                    // got commit response
+        client.CommitOffsets(ctx, uncommittedOffset, func(req *kmsg.OffsetCommitRequest, r *kmsg.OffsetCommitResponse, err error) {
+            // got commit response
+            latency := time.Since(startCommitTimestamp)
 
-                    latencyMs := timeNowMs() - startCommitTimestamp
-                    commitLatency := time.Duration(latencyMs * float64(time.Millisecond))
+            if err != nil {
+                s.logger.Error("offset commit failed", zap.Error(err), zap.Int64("latencyMilliseconds", latency.Milliseconds()))
+                return
+            }
 
+            for _, t := range r.Topics {
+                for _, p := range t.Partitions {
+                    err := kerr.ErrorForCode(p.ErrorCode)
                     if err != nil {
-                        s.logger.Error("offset commit failed", zap.Error(err), zap.Int64("latencyMilliseconds", commitLatency.Milliseconds()))
-                        return
-                    }
-
-                    for _, t := range r.Topics {
-                        for _, p := range t.Partitions {
-                            err := kerr.ErrorForCode(p.ErrorCode)
-                            if err != nil {
-                                s.logger.Error("error committing partition offset", zap.String("topic", t.Topic), zap.Int32("partitionId", p.Partition), zap.Error(err))
-                            }
-                        }
-                    }
-
-                    // only report commit latency if the coordinator wasn't set too long ago
-                    if time.Since(s.clientHooks.lastCoordinatorUpdate) < 10*time.Second {
-                        coordinator := s.clientHooks.currentCoordinator.Load().(kgo.BrokerMetadata)
-                        s.onOffsetCommit(coordinator.NodeID, commitLatency)
+                        s.logger.Error("error committing partition offset", zap.String("topic", t.Topic), zap.Int32("partitionId", p.Partition), zap.Error(err))
                     }
-                })
+                }
+            }
 
-            }
+            // only report commit latency if the coordinator wasn't set too long ago
+            if time.Since(s.clientHooks.lastCoordinatorUpdate) < 10*time.Second {
+                coordinator := s.clientHooks.currentCoordinator.Load().(kgo.BrokerMetadata)
+                s.onOffsetCommit(coordinator.NodeID, latency)
+            }
+        })
     }
-
 }
 
-// todo: then also create a "tracker" that knows about in-flight messages, and the latest successful roundtrips
-
-// processMessage takes a message and:
-// - checks if it matches minionID and latency
-// - updates metrics accordingly
-func (s *Service) processMessage(record *kgo.Record, receiveTimestampMs float64) {
+// processMessage:
+// - deserializes the message
+// - checks if it is from us, or from another kminion process running somewhere else
+// - hands it off to the service, which then reports metrics on it
+func (s *Service) processMessage(record *kgo.Record, receiveTimestamp time.Time) {
     var msg EndToEndMessage
     if jerr := json.Unmarshal(record.Value, &msg); jerr != nil {
         return // maybe older version
@@ -117,7 +109,14 @@ func (s *Service) processMessage(record *kgo.Record, receiveTimestamp time.Time) {
         return // not from us
     }
 
-    latency := time.Duration((receiveTimestampMs - msg.Timestamp) * float64(time.Millisecond))
+    // restore partition, which was not serialized
+    msg.partition = int(record.Partition)
+
+    created := msg.creationTime()
+    latency := receiveTimestamp.Sub(created)
 
     s.onRoundtrip(record.Partition, latency)
+
+    // notify the tracker that the message arrived
+    s.messageTracker.onMessageArrived(&msg)
 }
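Note: the commit splits the old ConsumeFromManagementTopic into startConsumeMessages (poll and process records) and commitOffsets (manual commits so per-commit latency can be measured), but the code that drives the two is not part of this diff. Below is a minimal sketch, within the e2e package, of how they might be wired together; the runConsumerTasks name and the 5-second commit interval are assumptions, not taken from the commit.

func (s *Service) runConsumerTasks(ctx context.Context) {
    // startConsumeMessages loops on PollFetches by itself until the context is cancelled.
    go s.startConsumeMessages(ctx)

    // commitOffsets performs a single commit pass per call, so drive it on a ticker.
    commitTicker := time.NewTicker(5 * time.Second) // interval is an assumption
    defer commitTicker.Stop()

    for {
        select {
        case <-ctx.Done():
            return
        case <-commitTicker.C:
            s.commitOffsets(ctx)
        }
    }
}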

e2e/group_tracker.go (+3 −3)

@@ -32,7 +32,7 @@ type groupTracker struct {
     groupId                string               // our own groupId
     potentiallyEmptyGroups map[string]time.Time // groupName -> utc timestamp when the group was first seen
 
-    isNotAuthorized bool
+    isNotAuthorized bool // if we get a not authorized response while trying to delete old groups, this will be set to true, essentially disabling the tracker
 }
 
 func newGroupTracker(svc *Service, ctx context.Context) *groupTracker {
@@ -120,7 +120,7 @@ func (g *groupTracker) checkAndDeleteOldConsumerGroups() error {
         _, exists := g.potentiallyEmptyGroups[name]
         if !exists {
             // add it with the current timestamp
-            now := time.Now().UTC()
+            now := time.Now()
             g.potentiallyEmptyGroups[name] = now
             g.logger.Debug("new empty kminion group, adding to tracker", zap.String("group", name), zap.Time("firstSeen", now))
         }
@@ -134,7 +134,7 @@ func (g *groupTracker) checkAndDeleteOldConsumerGroups() error {
         exists, _ := containsStr(matchingGroups, name)
         if exists {
             // still there, check age and maybe delete it
-            age := time.Now().UTC().Sub(firstSeen)
+            age := time.Now().Sub(firstSeen)
             if age > oldGroupMaxAge {
                 // group was unused for too long, delete it
                 groupsToDelete = append(groupsToDelete, name)
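For context on dropping the .UTC() calls: the tracker only uses these timestamps to compute how long a group has been empty, and the elapsed duration is the same whether or not the stored time was converted to UTC. A small standalone sketch with illustrative values:

package main

import (
    "fmt"
    "time"
)

func main() {
    // Pretend a kminion group was first seen 30 seconds ago.
    firstSeen := time.Now().Add(-30 * time.Second)

    // time.Since(t) is shorthand for time.Now().Sub(t); converting either side
    // to UTC only changes how the timestamp is displayed, not the difference.
    age := time.Since(firstSeen)

    oldGroupMaxAge := 20 * time.Second // illustrative threshold
    fmt.Println(age > oldGroupMaxAge)  // true: this group would be deleted
}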

e2e/message_tracker.go (new file, +106)

@@ -0,0 +1,106 @@
+package e2e
+
+import (
+    "context"
+    "time"
+
+    goCache "github.com/patrickmn/go-cache"
+    "go.uber.org/zap"
+)
+
+// messageTracker keeps track of messages (wow)
+//
+// When we successfully send a message, it will be added to this tracker.
+// Later, when we receive the message back in the consumer, the message is marked as completed and removed from the tracker.
+// If the message does not arrive within the configured `consumer.roundtripSla`, it is counted as lost.
+// A lost message is reported in the `roundtrip_latency_seconds` metric with infinite duration,
+// but it would probably be a good idea to also have a metric that reports the number of lost messages.
+//
+// When we fail to send a message, it isn't tracked.
+//
+// todo: We should probably report that in the roundtrip metric as infinite duration.
+//       since, if one broker is offline, we can't produce to the partition it leads,
+//       but we are still able to produce to other partitions led by other brokers.
+//       This should add at least a little protection against people who only alert on messages_produced and messages_received.
+//
+//       Alternatively, maybe some sort of "failed count" metric could be a good idea?
+//
+type messageTracker struct {
+    svc    *Service
+    logger *zap.Logger
+    ctx    context.Context
+    cache  *goCache.Cache
+}
+
+func newMessageTracker(svc *Service) *messageTracker {
+
+    defaultExpirationTime := svc.config.Consumer.RoundtripSla
+    cleanupInterval := 1 * time.Second
+
+    t := &messageTracker{
+        svc:    svc,
+        logger: svc.logger.Named("message-tracker"),
+        cache:  goCache.New(defaultExpirationTime, cleanupInterval),
+    }
+
+    t.cache.OnEvicted(func(key string, item interface{}) {
+        t.onMessageExpired(key, item.(*EndToEndMessage))
+    })
+
+    return t
+}
+
+func (t *messageTracker) addToTracker(msg *EndToEndMessage) {
+    t.cache.SetDefault(msg.MessageID, msg)
+}
+
+func (t *messageTracker) onMessageArrived(arrivedMessage *EndToEndMessage) {
+    cachedMessageInterface, _, found := t.cache.GetWithExpiration(arrivedMessage.MessageID)
+    if !found {
+        // message expired and was removed from the cache
+        // it arrived too late, nothing to do here...
+        return
+    }
+
+    actualExpireTime := arrivedMessage.creationTime().Add(t.svc.config.Consumer.RoundtripSla)
+    if time.Now().Before(actualExpireTime) {
+        // message arrived early enough
+
+        // timeUntilExpire := time.Until(actualExpireTime)
+        // t.logger.Debug("message arrived",
+        //     zap.Duration("timeLeft", timeUntilExpire),
+        //     zap.Duration("age", ),
+        //     zap.Int("partition", msg.partition),
+        //     zap.String("messageId", msg.MessageID),
+        // )
+    } else {
+        // Message arrived late, but was still in cache.
+        // Maybe we could log something like "message arrived after the sla"...
+        //
+        // But for now we don't report it as "lost" in the log (because it actually *did* arrive just now, just too late).
+        // The metrics will report it as 'duration infinite' anyway.
+    }
+
+    // Set it as arrived, so we don't log it as lost in 'onMessageExpired' and remove it from the tracker
+    msg := cachedMessageInterface.(*EndToEndMessage)
+    msg.hasArrived = true
+    t.cache.Delete(msg.MessageID)
+}
+
+func (t *messageTracker) onMessageExpired(key string, msg *EndToEndMessage) {
+
+    if msg.hasArrived {
+        // message did, in fact, arrive (doesn't matter here if soon enough or barely expired)
+        // don't log anything
+        return
+    }
+
+    created := msg.creationTime()
+    age := time.Since(created)
+
+    t.logger.Debug("message lost/expired",
+        zap.Int64("ageMilliseconds", age.Milliseconds()),
+        zap.Int("partition", msg.partition),
+        zap.String("messageId", msg.MessageID),
+    )
+}
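The tracker builds on go-cache's expiration semantics: SetDefault stores a message with the roundtrip SLA as its TTL, expired entries are evicted by the cache's janitor (which fires OnEvicted, so the message gets logged as lost), and an explicit Delete also fires OnEvicted, which is why onMessageArrived sets hasArrived before deleting. A standalone sketch of just that library behavior, using illustrative keys and values rather than kminion types:

package main

import (
    "fmt"
    "time"

    goCache "github.com/patrickmn/go-cache"
)

func main() {
    // Entries expire after 1s by default; the janitor sweeps every 200ms.
    c := goCache.New(1*time.Second, 200*time.Millisecond)

    // OnEvicted fires both for janitor evictions and for explicit Delete calls,
    // so the callback needs its own way to tell "expired" apart from "removed on
    // purpose" (the tracker uses the hasArrived flag for that).
    c.OnEvicted(func(key string, value interface{}) {
        fmt.Println("evicted:", key, value)
    })

    c.SetDefault("msg-1", "arrives in time")
    c.SetDefault("msg-2", "never arrives")

    if v, expiresAt, found := c.GetWithExpiration("msg-1"); found {
        fmt.Println("still tracked:", v, "until", expiresAt.Format(time.RFC3339))
    }
    c.Delete("msg-1") // "arrived": fires OnEvicted immediately

    time.Sleep(2 * time.Second) // msg-2 expires; the janitor fires OnEvicted for it
}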

e2e/partitioner.go (−8)

@@ -15,14 +15,6 @@ import (
     Why do we want to do that?
     We want to test all brokers with our "end-to-end test" test (sending message, receiving it again, measuring latency).
     To do that, we need to ensure we send a message to each broker.
-
-
-    todo:
-    Of course that also requires that we have exactly as many partitions as we have brokers,
-    and that each broker leads exactly one of our test partitions.
-
-    However, we only create the topic initially (with the right amount of partitions and proper leader balancing over the brokers).
-    So should two or more partitions of our topic end up being led (/hosted) by the same broker somehow, we neither detect nor fix that currently.
 */
 
 // Partitioner: Creates a TopicPartitioner for a given topic name
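The surviving comment states the goal (send a message to every partition so every leader broker is exercised) but the custom partitioner itself is not visible in this hunk. As a point of comparison, here is a sketch of hitting every partition with franz-go's built-in manual partitioner instead of a custom one; the function name, probe payload, and wiring are illustrative, not kminion's code.

package main

import (
    "context"

    "github.com/twmb/franz-go/pkg/kgo"
)

func produceToEveryPartition(ctx context.Context, brokers []string, topic string, partitionCount int) error {
    client, err := kgo.NewClient(
        kgo.SeedBrokers(brokers...),
        // ManualPartitioner sends each record to the partition set on the record itself.
        kgo.RecordPartitioner(kgo.ManualPartitioner()),
    )
    if err != nil {
        return err
    }
    defer client.Close()

    for p := 0; p < partitionCount; p++ {
        rec := &kgo.Record{
            Topic:     topic,
            Partition: int32(p),
            Value:     []byte("end-to-end probe"),
        }
        client.Produce(ctx, rec, nil) // fire-and-forget here for brevity
    }

    // Wait for all outstanding produce requests to finish.
    return client.Flush(ctx)
}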
