
Commit cc9a4e9

committed
- remove SLA gauge metrics; they can be inferred from the 'infinite' bucket in the latency histograms
- add methods to handle metric events (onAck, onRoundtrip, onOffsetCommit); these will also soon be used to extract e2e into its own package
- add _total and _seconds suffixes to metrics for best practices
1 parent a1ca198 commit cc9a4e9
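
The reasoning behind dropping the SLA gauges: every Prometheus histogram carries an implicit '+Inf' bucket, so "within SLA" can be computed at query time instead of being pre-baked into a gauge. A minimal sketch of that idea (not code from this commit — the metric name mirrors the e2e roundtrip histogram, and the bucket boundaries are made-up values standing in for whatever createHistogramBuckets produces):

```go
package main

import (
	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Hypothetical histogram mirroring the e2e roundtrip latency metric; the real
	// buckets come from createHistogramBuckets(cfg.EndToEnd.Consumer.RoundtripSla).
	roundtripLatency := prometheus.NewHistogram(prometheus.HistogramOpts{
		Namespace: "kminion",
		Subsystem: "end_to_end",
		Name:      "roundtrip_latency_seconds",
		Buckets:   []float64{0.25, 0.5, 1, 2}, // assume 2s is the configured roundtrip SLA
	})

	roundtripLatency.Observe(0.8) // within SLA: lands in le="1", le="2" and the implicit le="+Inf" bucket
	roundtripLatency.Observe(3.5) // SLA violation: only counted in le="+Inf"

	// At query time, something like
	//   rate(..._bucket{le="2"}[5m]) / rate(..._count[5m])
	// yields the fraction of roundtrips within the SLA; the removed is_within_*_sla
	// gauges only exposed a 0/1 snapshot of the same information.
}
```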

File tree

4 files changed: 75 additions, 78 deletions


minion/endtoend_consumer.go

+14-28
@@ -46,7 +46,7 @@ func (s *Service) ConsumeFromManagementTopic(ctx context.Context) error {
             zap.Error(err.Err))
         }

-        receiveTimestamp := timeNowMs()
+        receiveTimestampMs := timeNowMs()

         //
         // Process messages
@@ -59,13 +59,13 @@ func (s *Service) ConsumeFromManagementTopic(ctx context.Context) error {
                 continue
             }

-            s.processMessage(record, receiveTimestamp)
+            s.processMessage(record, receiveTimestampMs)
         }

         //
         // Commit offsets for processed messages
-        // todo:
-        // - do we need to keep track of what offset to commit for which partition??
+        // todo: the normal way to commit offsets with franz-go is pretty good, but in our special case
+        // we want to do it manually, separately for each partition, so we can track how long it took
         if uncommittedOffset := client.UncommittedOffsets(); uncommittedOffset != nil {

             startCommitTimestamp := timeNowMs()
@@ -74,18 +74,14 @@ func (s *Service) ConsumeFromManagementTopic(ctx context.Context) error {
                 // got commit response
                 if err != nil {
                     s.logger.Error(fmt.Sprintf("record had an error on commit: %v\n", err))
-                    s.setCachedItem("end_to_end_consumer_offset_availability", false, 120*time.Second)
-                } else {
-                    commitLatencySec := float64(timeNowMs()-startCommitTimestamp) / float64(1000)
-                    s.endToEndCommitLatency.Observe(commitLatencySec)
-                    s.endToEndMessagesCommitted.Inc()
-
-                    if commitLatencySec <= s.Cfg.EndToEnd.Consumer.CommitSla.Seconds() {
-                        s.endToEndWithinCommitSla.Set(1)
-                    } else {
-                        s.endToEndWithinCommitSla.Set(0)
-                    }
+                    return
                 }
+
+                latencyMs := timeNowMs() - startCommitTimestamp
+                commitLatency := time.Duration(latencyMs * float64(time.Millisecond))
+
+                // todo: partitionID
+                s.onOffsetCommit(0, commitLatency)
             })
         }
     }
@@ -99,7 +95,7 @@ func (s *Service) ConsumeFromManagementTopic(ctx context.Context) error {
 // processMessage takes a message and:
 // - checks if it matches minionID and latency
 // - updates metrics accordingly
-func (s *Service) processMessage(record *kgo.Record, receiveTimestamp int64) {
+func (s *Service) processMessage(record *kgo.Record, receiveTimestampMs float64) {
     var msg EndToEndMessage
     if jerr := json.Unmarshal(record.Value, &msg); jerr != nil {
         return // maybe older version
@@ -109,17 +105,7 @@ func (s *Service) processMessage(record *kgo.Record, receiveTimestamp int64) {
         return // not from us
     }

-    if msg.Timestamp < s.lastRoundtripTimestamp {
-        return // msg older than what we recently processed (out of order, should never happen)
-    }
-
-    latencyMs := receiveTimestamp - msg.Timestamp
-    if latencyMs > s.Cfg.EndToEnd.Consumer.RoundtripSla.Milliseconds() {
-        s.endToEndWithinRoundtripSla.Set(0)
-        return // too late!
-    }
+    latency := time.Duration((receiveTimestampMs - msg.Timestamp) * float64(time.Millisecond))

-    s.lastRoundtripTimestamp = msg.Timestamp
-    s.endToEndMessagesReceived.Inc()
-    s.endToEndRoundtripLatency.Observe(float64(latencyMs) / 1000)
+    s.onRoundtrip(record.Partition, latency)
 }
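
Both of the new call sites above turn a float64 millisecond difference into a time.Duration before handing it to the handler methods on Service. A tiny self-contained sketch of that conversion (the timestamp values are made up):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	// timeNowMs() now returns float64 milliseconds, so latencies are computed as a
	// float difference and then scaled by float64(time.Millisecond) to get a Duration.
	receiveTimestampMs := 512.5 // hypothetical values, in milliseconds
	sendTimestampMs := 500.0
	latency := time.Duration((receiveTimestampMs - sendTimestampMs) * float64(time.Millisecond))
	fmt.Println(latency) // 12.5ms
}
```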

minion/endtoend_producer.go

+6-10
@@ -4,14 +4,15 @@ import (
     "context"
     "encoding/json"
     "fmt"
+    "time"

     "github.com/google/uuid"
     "github.com/twmb/franz-go/pkg/kgo"
 )

 type EndToEndMessage struct {
-    MinionID  string `json:"minionID"`
-    Timestamp int64  `json:"timestamp"`
+    MinionID  string  `json:"minionID"`
+    Timestamp float64 `json:"timestamp"`
 }

 func (s *Service) produceToManagementTopic(ctx context.Context) error {
@@ -33,18 +34,13 @@ func (s *Service) produceToManagementTopic(ctx context.Context) error {

     err = s.kafkaSvc.Client.Produce(ctx, record, func(r *kgo.Record, err error) {
         endTime := timeNowMs()
-        ackDuration := endTime - startTime
+        ackDurationMs := endTime - startTime
+        ackDuration := time.Duration(ackDurationMs) * time.Millisecond

         if err != nil {
             fmt.Printf("record had a produce error: %v\n", err)
         } else {
-            s.endToEndMessagesAcked.Inc()
-
-            if ackDuration < s.Cfg.EndToEnd.Producer.AckSla.Milliseconds() {
-                s.endToEndWithinRoundtripSla.Set(1)
-            } else {
-                s.endToEndWithinRoundtripSla.Set(0)
-            }
+            s.onAck(r.Partition, ackDuration)
         }
     })
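Switching Timestamp to float64 keeps the struct in step with the new float64-millisecond timeNowMs(); a short sketch (the minion ID and timestamp are invented) showing that the fractional milliseconds survive the JSON roundtrip, where an int64 field would have forced truncation:

```go
package main

import (
	"encoding/json"
	"fmt"
)

// Same shape as the updated struct in endtoend_producer.go.
type EndToEndMessage struct {
	MinionID  string  `json:"minionID"`
	Timestamp float64 `json:"timestamp"`
}

func main() {
	// With a float64 field, the sub-millisecond fraction produced by timeNowMs()
	// survives marshalling and unmarshalling unchanged.
	in := EndToEndMessage{MinionID: "example-minion", Timestamp: 1617184467123.456}
	raw, _ := json.Marshal(in)

	var out EndToEndMessage
	_ = json.Unmarshal(raw, &out)
	fmt.Printf("%s %.3f\n", out.MinionID, out.Timestamp) // example-minion 1617184467123.456
}
```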

minion/endtoend_topic.go

+5-5
@@ -22,11 +22,11 @@ func (s *Service) validateManagementTopic(ctx context.Context) error {
     // TopicMetadataArray could be empty, therefore needs to do this check beforehand
     topicMetadataArray := topicMetadata.Topics
     if len(topicMetadataArray) == 0 {
-        return fmt.Errorf("Unable to retrieve metadata, please make sure the brokers are up and/or you have right to access them")
+        return fmt.Errorf("unable to retrieve metadata, please make sure the brokers are up and/or you have the right to access them")
     }
     doesTopicReachable := topicMetadata.Topics[0].Topic != ""
     if !doesTopicReachable {
-        return fmt.Errorf("Unable to retrieve metadata, please make sure the brokers are up and/or you have right to access them")
+        return fmt.Errorf("unable to retrieve metadata, please make sure the brokers are up and/or you have the right to access them")
     }

     // Create the management end to end topic if it does not exist
@@ -43,7 +43,7 @@ func (s *Service) validateManagementTopic(ctx context.Context) error {
     // topicMetadata.Brokers will return all the available brokers from the cluster
     isNumBrokerValid := len(topicMetadata.Brokers) >= expectedReplicationFactor
     if !isNumBrokerValid {
-        return fmt.Errorf("Current cluster size differs from the expected size. Expected broker: %v NumOfBroker: %v", len(topicMetadata.Brokers), expectedReplicationFactor)
+        return fmt.Errorf("current cluster size differs from the expected size. expected broker: %v NumOfBroker: %v", len(topicMetadata.Brokers), expectedReplicationFactor)
     }

     // Check the number of Partition per broker, if it is too low create partition
@@ -244,6 +244,6 @@ func (s *Service) initEndToEnd(ctx context.Context) {
     }
 }

-func timeNowMs() int64 {
-    return time.Now().UnixNano() / int64(time.Millisecond)
+func timeNowMs() float64 {
+    return float64(time.Now().UnixNano()) / float64(time.Millisecond)
 }

minion/service.go

+50-35
@@ -36,25 +36,18 @@ type Service struct {
     storage *Storage

     // EndToEnd
-    minionID               string // unique identifier, reported in metrics, in case multiple instances run at the same time
-    lastRoundtripTimestamp int64  // creation time (in utc ms) of the message that most recently passed the roundtripSla check
+    minionID               string  // unique identifier, reported in metrics, in case multiple instances run at the same time
+    lastRoundtripTimestamp float64 // creation time (in utc ms) of the message that most recently passed the roundtripSla check

     // EndToEnd Metrics
     endToEndMessagesProduced  prometheus.Counter
     endToEndMessagesAcked     prometheus.Counter
     endToEndMessagesReceived  prometheus.Counter
     endToEndMessagesCommitted prometheus.Counter

-    endToEndWithinAckSla       prometheus.Gauge
-    endToEndWithinRoundtripSla prometheus.Gauge
-    endToEndWithinCommitSla    prometheus.Gauge
-
-    endToEndProduceLatency   prometheus.Histogram
+    endToEndAckLatency       prometheus.Histogram
     endToEndRoundtripLatency prometheus.Histogram
     endToEndCommitLatency    prometheus.Histogram
-
-    // todo: produce latency histogram
-
 }

 func NewService(cfg Config, logger *zap.Logger, kafkaSvc *kafka.Service, metricNamespace string) (*Service, error) {
@@ -92,14 +85,6 @@ func NewService(cfg Config, logger *zap.Logger, kafkaSvc *kafka.Service, metricNamespace string) (*Service, error) {

     // End-to-End metrics
     if cfg.EndToEnd.Enabled {
-        makeGauge := func(name string, help string) prometheus.Gauge {
-            return promauto.NewGauge(prometheus.GaugeOpts{
-                Namespace: metricNamespace,
-                Subsystem: "end_to_end",
-                Name:      name,
-                Help:      help,
-            })
-        }
         makeCounter := func(name string, help string) prometheus.Counter {
             return promauto.NewCounter(prometheus.CounterOpts{
                 Namespace: metricNamespace,
@@ -119,26 +104,18 @@ func NewService(cfg Config, logger *zap.Logger, kafkaSvc *kafka.Service, metricNamespace string) (*Service, error) {
         }

         // Low-level info
-        // Users can construct stuff like "message commits failed" themselves from those
-        service.endToEndMessagesProduced = makeCounter("messages_produced", "Number of messages that kminion's end-to-end test has tried to send to kafka")
-        service.endToEndMessagesAcked = makeCounter("messages_acked", "Number of messages kafka acknowledged as produced")
-        service.endToEndMessagesReceived = makeCounter("messages_received", "Number of *matching* messages kminion received. Every roundtrip message has a minionID (randomly generated on startup) and a timestamp. Kminion only considers a message a match if it it arrives within the configured roundtrip SLA (and it matches the minionID)")
-        service.endToEndMessagesCommitted = makeCounter("messages_committed", "Number of *matching* messages kminion successfully commited as read/processed. See 'messages_received' for what 'matching' means. Kminion will commit late/mismatching messages to kafka as well, but those won't be counted in this metric.")
-
-        // High-level SLA reporting
-        // Simple gauges that report if stuff is within the configured SLAs
-        // Naturally those will potentially not trigger if, for example, only a single message is lost in-between scrap intervals.
-        gaugeHelp := "Will be either 0 (false) or 1 (true), depending on the durations (SLAs) configured in kminion's config"
-        service.endToEndWithinAckSla = makeGauge("is_within_ack_sla", "Reports whether messages can be produced. A message is only considered 'produced' when the broker has sent an ack within the configured timeout. "+gaugeHelp)
-        service.endToEndWithinRoundtripSla = makeGauge("is_within_roundtrip_sla", "Reports whether or not kminion receives the test messages it produces within the configured timeout. "+gaugeHelp)
-        service.endToEndWithinCommitSla = makeGauge("is_within_commit_sla", "Reports whether or not kminion can successfully commit offsets for the messages it receives/processes within the configured timeout. "+gaugeHelp)
+        // Users can construct alerts like "can't produce messages" themselves from those
+        service.endToEndMessagesProduced = makeCounter("messages_produced_total", "Number of messages that kminion's end-to-end test has tried to send to kafka")
+        service.endToEndMessagesAcked = makeCounter("messages_acked_total", "Number of messages kafka acknowledged as produced")
+        service.endToEndMessagesReceived = makeCounter("messages_received_total", "Number of *matching* messages kminion received. Every roundtrip message has a minionID (randomly generated on startup) and a timestamp. Kminion only considers a message a match if it arrives within the configured roundtrip SLA (and it matches the minionID)")
+        service.endToEndMessagesCommitted = makeCounter("messages_committed_total", "Number of *matching* messages kminion successfully committed as read/processed. See 'messages_received' for what 'matching' means. Kminion will commit late/mismatching messages to kafka as well, but those won't be counted in this metric.")

         // Latency Histograms
         // More detailed info about how long stuff took
-        // Since histograms also have an 'infinite' bucket, they can be used to detect small hickups that won't trigger the SLA gauges
-        service.endToEndProduceLatency = makeHistogram("produce_latency", cfg.EndToEnd.Producer.AckSla, "Time until we received an ack for a produced message")
-        service.endToEndRoundtripLatency = makeHistogram("roundtrip_latency", cfg.EndToEnd.Consumer.RoundtripSla, "Time it took between sending (producing) and receiving (consuming) a message")
-        service.endToEndCommitLatency = makeHistogram("commit_latency", cfg.EndToEnd.Consumer.CommitSla, "Time kafka took to respond to kminion's offset commit")
+        // Since histograms also have an 'infinite' bucket, they can be used to detect small hiccups and "lost" messages
+        service.endToEndAckLatency = makeHistogram("produce_latency_seconds", cfg.EndToEnd.Producer.AckSla, "Time until we received an ack for a produced message")
+        service.endToEndRoundtripLatency = makeHistogram("roundtrip_latency_seconds", cfg.EndToEnd.Consumer.RoundtripSla, "Time it took between sending (producing) and receiving (consuming) a message")
+        service.endToEndCommitLatency = makeHistogram("commit_latency_seconds", cfg.EndToEnd.Consumer.CommitSla, "Time kafka took to respond to kminion's offset commit")
     }

     return service, nil
@@ -236,3 +213,41 @@ func createHistogramBuckets(maxLatency time.Duration) []float64 {

     return bucket
 }
+
+// called from e2e when a message is acknowledged
+func (s *Service) onAck(partitionId int32, duration time.Duration) {
+    s.endToEndMessagesAcked.Inc()
+    s.endToEndAckLatency.Observe(duration.Seconds())
+}
+
+// called from e2e when a message completes a roundtrip (send to kafka, receive msg from kafka again)
+func (s *Service) onRoundtrip(partitionId int32, duration time.Duration) {
+    if duration > s.Cfg.EndToEnd.Consumer.RoundtripSla {
+        return // message is too old
+    }
+
+    // todo: track "lastRoundtripMessage"
+    // if msg.Timestamp < s.lastRoundtripTimestamp {
+    //     return // msg older than what we recently processed (out of order, should never happen)
+    // }
+
+    s.endToEndMessagesReceived.Inc()
+    s.endToEndRoundtripLatency.Observe(duration.Seconds())
+}
+
+// called from e2e when an offset commit is confirmed
+func (s *Service) onOffsetCommit(partitionId int32, duration time.Duration) {
+
+    // todo:
+    // if the commit took too long, don't count it in 'commits' but add it to the histogram?
+    // and how do we want to handle cases where we get an error??
+    // should we have another metric that tells us about failed commits? or a label on the counter?
+
+    s.endToEndCommitLatency.Observe(duration.Seconds())
+
+    if duration > s.Cfg.EndToEnd.Consumer.CommitSla {
+        return
+    }
+
+    s.endToEndMessagesCommitted.Inc()
+}
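
The hunk above touches createHistogramBuckets(maxLatency time.Duration) []float64 without showing its body. Purely as an illustration of the property the commit message relies on — bucket boundaries ending at the configured SLA, so anything slower shows up only in the implicit '+Inf' bucket — a stand-in helper might look roughly like this (this is not the repository's actual implementation):

```go
package main

import (
	"fmt"
	"time"
)

// exampleHistogramBuckets is an illustrative stand-in for createHistogramBuckets:
// a handful of exponentially spaced boundaries whose largest finite value is the
// configured SLA, so any observation slower than the SLA is visible only in '+Inf'.
func exampleHistogramBuckets(maxLatency time.Duration) []float64 {
	const count = 8
	buckets := make([]float64, count)
	boundary := maxLatency.Seconds()
	for i := count - 1; i >= 0; i-- {
		buckets[i] = boundary
		boundary /= 2
	}
	return buckets
}

func main() {
	// For a 2s commit SLA this yields [0.015625 ... 0.5 1 2].
	fmt.Println(exampleHistogramBuckets(2 * time.Second))
}
```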
