Commit cc9a4e9

- remove SLA gauge metrics, they can be inferred from the 'infinite' bucket in the latency histograms

- methods to handle metric events (onAck, onRoundtrip, onOffsetCommit); will also soon be used to extract e2e into its own package
- add _total and _seconds suffixes to metrics for best practices
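
The first bullet relies on a standard Prometheus pattern: every histogram carries an implicit '+Inf' bucket that counts all observations, so a separate 0/1 SLA gauge adds no information. Assuming the metric names introduced in this commit and a bucket boundary placed at the SLA (say 1 second), the share of roundtrips within the SLA can be derived at query time, for example with sum(rate(kminion_end_to_end_roundtrip_latency_seconds_bucket{le="1"}[5m])) / sum(rate(kminion_end_to_end_roundtrip_latency_seconds_count[5m])); the kminion namespace prefix is an assumption here, not taken from the diff.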
rikimaru0345 committed Apr 29, 2021
1 parent a1ca198 commit cc9a4e9
Showing 4 changed files with 75 additions and 78 deletions.
minion/endtoend_consumer.go: 42 changes (14 additions, 28 deletions)
@@ -46,7 +46,7 @@ func (s *Service) ConsumeFromManagementTopic(ctx context.Context) error {
             zap.Error(err.Err))
     }
 
-    receiveTimestamp := timeNowMs()
+    receiveTimestampMs := timeNowMs()
 
     //
     // Process messages
@@ -59,13 +59,13 @@ func (s *Service) ConsumeFromManagementTopic(ctx context.Context) error {
             continue
         }
 
-        s.processMessage(record, receiveTimestamp)
+        s.processMessage(record, receiveTimestampMs)
     }
 
     //
     // Commit offsets for processed messages
-    // todo:
-    // - do we need to keep track of what offset to commit for which partition??
+    // todo: the normal way to commit offsets with franz-go is pretty good, but in our special case
+    // we want to do it manually, separately for each partition, so we can track how long it took
     if uncommittedOffset := client.UncommittedOffsets(); uncommittedOffset != nil {
 
         startCommitTimestamp := timeNowMs()
@@ -74,18 +74,14 @@ func (s *Service) ConsumeFromManagementTopic(ctx context.Context) error {
             // got commit response
             if err != nil {
                 s.logger.Error(fmt.Sprintf("record had an error on commit: %v\n", err))
-                s.setCachedItem("end_to_end_consumer_offset_availability", false, 120*time.Second)
-            } else {
-                commitLatencySec := float64(timeNowMs()-startCommitTimestamp) / float64(1000)
-                s.endToEndCommitLatency.Observe(commitLatencySec)
-                s.endToEndMessagesCommitted.Inc()
-
-                if commitLatencySec <= s.Cfg.EndToEnd.Consumer.CommitSla.Seconds() {
-                    s.endToEndWithinCommitSla.Set(1)
-                } else {
-                    s.endToEndWithinCommitSla.Set(0)
-                }
+                return
             }
+
+            latencyMs := timeNowMs() - startCommitTimestamp
+            commitLatency := time.Duration(latencyMs * float64(time.Millisecond))
+
+            // todo: partitionID
+            s.onOffsetCommit(0, commitLatency)
         })
     }
 }
@@ -99,7 +95,7 @@ func (s *Service) ConsumeFromManagementTopic(ctx context.Context) error {
 // processMessage takes a message and:
 // - checks if it matches minionID and latency
 // - updates metrics accordingly
-func (s *Service) processMessage(record *kgo.Record, receiveTimestamp int64) {
+func (s *Service) processMessage(record *kgo.Record, receiveTimestampMs float64) {
     var msg EndToEndMessage
     if jerr := json.Unmarshal(record.Value, &msg); jerr != nil {
         return // maybe older version
@@ -109,17 +105,7 @@ func (s *Service) processMessage(record *kgo.Record, receiveTimestamp int64) {
         return // not from us
     }
 
-    if msg.Timestamp < s.lastRoundtripTimestamp {
-        return // msg older than what we recently processed (out of order, should never happen)
-    }
-
-    latencyMs := receiveTimestamp - msg.Timestamp
-    if latencyMs > s.Cfg.EndToEnd.Consumer.RoundtripSla.Milliseconds() {
-        s.endToEndWithinRoundtripSla.Set(0)
-        return // too late!
-    }
+    latency := time.Duration((receiveTimestampMs - msg.Timestamp) * float64(time.Millisecond))
 
-    s.lastRoundtripTimestamp = msg.Timestamp
-    s.endToEndMessagesReceived.Inc()
-    s.endToEndRoundtripLatency.Observe(float64(latencyMs) / 1000)
+    s.onRoundtrip(record.Partition, latency)
 }
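
The todo comment in the commit hunk above says offsets should eventually be committed manually, separately per partition, so each commit can be timed on its own. A minimal sketch of how that could look with franz-go, assuming the UncommittedOffsets/CommitOffsets API as of early 2021; the callback wiring and error handling here are illustrative, not code from this commit:

    for topic, partitions := range client.UncommittedOffsets() {
        for partition, offset := range partitions {
            // one request per partition, so its commit latency can be measured alone
            single := map[string]map[int32]kgo.EpochOffset{topic: {partition: offset}}
            start := timeNowMs()
            p := partition // capture the loop variable for the callback
            client.CommitOffsets(ctx, single, func(_ *kgo.Client, _ *kmsg.OffsetCommitRequest, _ *kmsg.OffsetCommitResponse, err error) {
                if err != nil {
                    return // todo: how should failed commits be counted?
                }
                latency := time.Duration((timeNowMs() - start) * float64(time.Millisecond))
                s.onOffsetCommit(p, latency)
            })
        }
    }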
minion/endtoend_producer.go: 16 changes (6 additions, 10 deletions)
@@ -4,14 +4,15 @@ import (
     "context"
     "encoding/json"
     "fmt"
+    "time"
 
     "github.com/google/uuid"
     "github.com/twmb/franz-go/pkg/kgo"
 )
 
 type EndToEndMessage struct {
-    MinionID  string `json:"minionID"`
-    Timestamp int64  `json:"timestamp"`
+    MinionID  string  `json:"minionID"`
+    Timestamp float64 `json:"timestamp"`
 }
 
 func (s *Service) produceToManagementTopic(ctx context.Context) error {
@@ -33,18 +34,13 @@ func (s *Service) produceToManagementTopic(ctx context.Context) error {
 
     err = s.kafkaSvc.Client.Produce(ctx, record, func(r *kgo.Record, err error) {
         endTime := timeNowMs()
-        ackDuration := endTime - startTime
+        ackDurationMs := endTime - startTime
+        ackDuration := time.Duration(ackDurationMs) * time.Millisecond
 
         if err != nil {
             fmt.Printf("record had a produce error: %v\n", err)
         } else {
-            s.endToEndMessagesAcked.Inc()
-
-            if ackDuration < s.Cfg.EndToEnd.Producer.AckSla.Milliseconds() {
-                s.endToEndWithinRoundtripSla.Set(1)
-            } else {
-                s.endToEndWithinRoundtripSla.Set(0)
-            }
+            s.onAck(r.Partition, ackDuration)
         }
     })
 
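For context, the EndToEndMessage struct above is what the producer serializes into each probe record. A minimal sketch of building such a record; managementTopicName is a hypothetical stand-in for the configured topic, not a name from this commit:

    msg := EndToEndMessage{
        MinionID:  s.minionID,
        Timestamp: timeNowMs(), // float64 UTC milliseconds, matching the consumer side
    }
    value, err := json.Marshal(msg)
    if err != nil {
        return err
    }
    record := &kgo.Record{Topic: managementTopicName, Value: value} // topic name is a placeholder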
minion/endtoend_topic.go: 10 changes (5 additions, 5 deletions)
@@ -22,11 +22,11 @@ func (s *Service) validateManagementTopic(ctx context.Context) error {
     // TopicMetadataArray could be empty, therefore needs to do this check beforehand
     topicMetadataArray := topicMetadata.Topics
     if len(topicMetadataArray) == 0 {
-        return fmt.Errorf("Unable to retrieve metadata, please make sure the brokers are up and/or you have right to access them")
+        return fmt.Errorf("unable to retrieve metadata, please make sure the brokers are up and/or you have right to access them")
     }
     doesTopicReachable := topicMetadata.Topics[0].Topic != ""
     if !doesTopicReachable {
-        return fmt.Errorf("Unable to retrieve metadata, please make sure the brokers are up and/or you have right to access them")
+        return fmt.Errorf("unable to retrieve metadata, please make sure the brokers are up and/or you have right to access them")
     }
 
     // Create the management end to end topic if it does not exist
@@ -43,7 +43,7 @@ func (s *Service) validateManagementTopic(ctx context.Context) error {
     // topicMetadata.Brokers will return all the available brokers from the cluster
     isNumBrokerValid := len(topicMetadata.Brokers) >= expectedReplicationFactor
     if !isNumBrokerValid {
-        return fmt.Errorf("Current cluster size differs from the expected size. Expected broker: %v NumOfBroker: %v", len(topicMetadata.Brokers), expectedReplicationFactor)
+        return fmt.Errorf("current cluster size differs from the expected size. expected broker: %v NumOfBroker: %v", len(topicMetadata.Brokers), expectedReplicationFactor)
     }
 
     // Check the number of Partition per broker, if it is too low create partition
@@ -244,6 +244,6 @@ func (s *Service) initEndToEnd(ctx context.Context) {
     }
 }
 
-func timeNowMs() int64 {
-    return time.Now().UnixNano() / int64(time.Millisecond)
+func timeNowMs() float64 {
+    return float64(time.Now().UnixNano()) / float64(time.Millisecond)
 }
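
Switching timeNowMs from int64 to float64 keeps sub-millisecond resolution, which matters for latencies under a few milliseconds; a small illustration of the conversion pattern this commit uses:

    sentMs := timeNowMs() // float64 milliseconds since the Unix epoch
    // ... message travels to kafka and back ...
    receivedMs := timeNowMs()

    // converting back to time.Duration preserves the fractional milliseconds
    // that an int64 millisecond count would have truncated away
    latency := time.Duration((receivedMs - sentMs) * float64(time.Millisecond))
    fmt.Printf("roundtrip took %v\n", latency)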
minion/service.go: 85 changes (50 additions, 35 deletions)
@@ -36,25 +36,18 @@ type Service struct {
     storage *Storage
 
     // EndToEnd
-    minionID               string // unique identifier, reported in metrics, in case multiple instances run at the same time
-    lastRoundtripTimestamp int64  // creation time (in utc ms) of the message that most recently passed the roundtripSla check
+    minionID               string  // unique identifier, reported in metrics, in case multiple instances run at the same time
+    lastRoundtripTimestamp float64 // creation time (in utc ms) of the message that most recently passed the roundtripSla check
 
     // EndToEnd Metrics
     endToEndMessagesProduced  prometheus.Counter
     endToEndMessagesAcked     prometheus.Counter
     endToEndMessagesReceived  prometheus.Counter
     endToEndMessagesCommitted prometheus.Counter
 
-    endToEndWithinAckSla       prometheus.Gauge
-    endToEndWithinRoundtripSla prometheus.Gauge
-    endToEndWithinCommitSla    prometheus.Gauge
-
-    endToEndProduceLatency   prometheus.Histogram
+    endToEndAckLatency       prometheus.Histogram
     endToEndRoundtripLatency prometheus.Histogram
     endToEndCommitLatency    prometheus.Histogram
-
-    // todo: produce latency histogram
-
 }
 
 func NewService(cfg Config, logger *zap.Logger, kafkaSvc *kafka.Service, metricNamespace string) (*Service, error) {
Expand Down Expand Up @@ -92,14 +85,6 @@ func NewService(cfg Config, logger *zap.Logger, kafkaSvc *kafka.Service, metricN

// End-to-End metrics
if cfg.EndToEnd.Enabled {
makeGauge := func(name string, help string) prometheus.Gauge {
return promauto.NewGauge(prometheus.GaugeOpts{
Namespace: metricNamespace,
Subsystem: "end_to_end",
Name: name,
Help: help,
})
}
makeCounter := func(name string, help string) prometheus.Counter {
return promauto.NewCounter(prometheus.CounterOpts{
Namespace: metricNamespace,
@@ -119,26 +104,18 @@ func NewService(cfg Config, logger *zap.Logger, kafkaSvc *kafka.Service, metricNamespace string) (*Service, error) {
         }
 
         // Low-level info
-        // Users can construct stuff like "message commits failed" themselves from those
-        service.endToEndMessagesProduced = makeCounter("messages_produced", "Number of messages that kminion's end-to-end test has tried to send to kafka")
-        service.endToEndMessagesAcked = makeCounter("messages_acked", "Number of messages kafka acknowledged as produced")
-        service.endToEndMessagesReceived = makeCounter("messages_received", "Number of *matching* messages kminion received. Every roundtrip message has a minionID (randomly generated on startup) and a timestamp. Kminion only considers a message a match if it arrives within the configured roundtrip SLA (and it matches the minionID)")
-        service.endToEndMessagesCommitted = makeCounter("messages_committed", "Number of *matching* messages kminion successfully committed as read/processed. See 'messages_received' for what 'matching' means. Kminion will commit late/mismatching messages to kafka as well, but those won't be counted in this metric.")
-
-        // High-level SLA reporting
-        // Simple gauges that report if stuff is within the configured SLAs
-        // Naturally those will potentially not trigger if, for example, only a single message is lost in-between scrape intervals.
-        gaugeHelp := "Will be either 0 (false) or 1 (true), depending on the durations (SLAs) configured in kminion's config"
-        service.endToEndWithinAckSla = makeGauge("is_within_ack_sla", "Reports whether messages can be produced. A message is only considered 'produced' when the broker has sent an ack within the configured timeout. "+gaugeHelp)
-        service.endToEndWithinRoundtripSla = makeGauge("is_within_roundtrip_sla", "Reports whether or not kminion receives the test messages it produces within the configured timeout. "+gaugeHelp)
-        service.endToEndWithinCommitSla = makeGauge("is_within_commit_sla", "Reports whether or not kminion can successfully commit offsets for the messages it receives/processes within the configured timeout. "+gaugeHelp)
+        // Users can construct alerts like "can't produce messages" themselves from those
+        service.endToEndMessagesProduced = makeCounter("messages_produced_total", "Number of messages that kminion's end-to-end test has tried to send to kafka")
+        service.endToEndMessagesAcked = makeCounter("messages_acked_total", "Number of messages kafka acknowledged as produced")
+        service.endToEndMessagesReceived = makeCounter("messages_received_total", "Number of *matching* messages kminion received. Every roundtrip message has a minionID (randomly generated on startup) and a timestamp. Kminion only considers a message a match if it arrives within the configured roundtrip SLA (and it matches the minionID)")
+        service.endToEndMessagesCommitted = makeCounter("messages_committed_total", "Number of *matching* messages kminion successfully committed as read/processed. See 'messages_received' for what 'matching' means. Kminion will commit late/mismatching messages to kafka as well, but those won't be counted in this metric.")
 
         // Latency Histograms
         // More detailed info about how long stuff took
-        // Since histograms also have an 'infinite' bucket, they can be used to detect small hiccups that won't trigger the SLA gauges
-        service.endToEndProduceLatency = makeHistogram("produce_latency", cfg.EndToEnd.Producer.AckSla, "Time until we received an ack for a produced message")
-        service.endToEndRoundtripLatency = makeHistogram("roundtrip_latency", cfg.EndToEnd.Consumer.RoundtripSla, "Time it took between sending (producing) and receiving (consuming) a message")
-        service.endToEndCommitLatency = makeHistogram("commit_latency", cfg.EndToEnd.Consumer.CommitSla, "Time kafka took to respond to kminion's offset commit")
+        // Since histograms also have an 'infinite' bucket, they can be used to detect small hiccups and "lost" messages
+        service.endToEndAckLatency = makeHistogram("produce_latency_seconds", cfg.EndToEnd.Producer.AckSla, "Time until we received an ack for a produced message")
+        service.endToEndRoundtripLatency = makeHistogram("roundtrip_latency_seconds", cfg.EndToEnd.Consumer.RoundtripSla, "Time it took between sending (producing) and receiving (consuming) a message")
+        service.endToEndCommitLatency = makeHistogram("commit_latency_seconds", cfg.EndToEnd.Consumer.CommitSla, "Time kafka took to respond to kminion's offset commit")
     }
 
     return service, nil
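
makeHistogram itself is elided from this hunk; judging from its call sites and the makeCounter helper shown above, a plausible shape would be the following sketch (an assumption, not the repository's verbatim code):

    makeHistogram := func(name string, maxLatency time.Duration, help string) prometheus.Histogram {
        return promauto.NewHistogram(prometheus.HistogramOpts{
            Namespace: metricNamespace,
            Subsystem: "end_to_end",
            Name:      name,
            Help:      help,
            // buckets end near the configured SLA, so anything slower lands
            // only in the implicit '+Inf' bucket
            Buckets: createHistogramBuckets(maxLatency),
        })
    }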
@@ -236,3 +213,41 @@ func createHistogramBuckets(maxLatency time.Duration) []float64 {
 
     return bucket
 }
+
+// called from e2e when a message is acknowledged
+func (s *Service) onAck(partitionId int32, duration time.Duration) {
+    s.endToEndMessagesAcked.Inc()
+    s.endToEndAckLatency.Observe(duration.Seconds())
+}
+
+// called from e2e when a message completes a roundtrip (send to kafka, receive msg from kafka again)
+func (s *Service) onRoundtrip(partitionId int32, duration time.Duration) {
+    if duration > s.Cfg.EndToEnd.Consumer.RoundtripSla {
+        return // message is too old
+    }
+
+    // todo: track "lastRoundtripMessage"
+    // if msg.Timestamp < s.lastRoundtripTimestamp {
+    //     return // msg older than what we recently processed (out of order, should never happen)
+    // }
+
+    s.endToEndMessagesReceived.Inc()
+    s.endToEndRoundtripLatency.Observe(duration.Seconds())
+}
+
+// called from e2e when an offset commit is confirmed
+func (s *Service) onOffsetCommit(partitionId int32, duration time.Duration) {
+
+    // todo:
+    // if the commit took too long, don't count it in 'commits' but add it to the histogram?
+    // and how do we want to handle cases where we get an error??
+    // should we have another metric that tells us about failed commits? or a label on the counter?
+
+    s.endToEndCommitLatency.Observe(duration.Seconds())
+
+    if duration > s.Cfg.EndToEnd.Consumer.CommitSla {
+        return
+    }
+
+    s.endToEndMessagesCommitted.Inc()
+}
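
One way to answer the open question in onOffsetCommit's todo, a label on the counter rather than a separate metric, sketched with a hypothetical CounterVec (not part of this commit):

    offsetCommits := promauto.NewCounterVec(prometheus.CounterOpts{
        Namespace: metricNamespace,
        Subsystem: "end_to_end",
        Name:      "offset_commits_total",
        Help:      "Offset commits attempted by the end-to-end test, partitioned by outcome",
    }, []string{"status"})

    offsetCommits.WithLabelValues("ok").Inc()     // on success
    offsetCommits.WithLabelValues("failed").Inc() // on error or SLA violation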
