
Commit 1349666

move end-to-end into its own package called e2e
1 parent cc9a4e9 commit 1349666

11 files changed: +242 -162 lines

minion/config_endtoend.go renamed to e2e/config.go

+4 -4

@@ -1,27 +1,27 @@
-package minion
+package e2e
 
 import (
     "fmt"
     "time"
 )
 
-type EndToEndConfig struct {
+type Config struct {
     Enabled         bool                   `koanf:"enabled"`
     TopicManagement EndToEndTopicConfig    `koanf:"topicManagement"`
     ProbeInterval   time.Duration          `koanf:"probeInterval"`
     Producer        EndToEndProducerConfig `koanf:"producer"`
     Consumer        EndToEndConsumerConfig `koanf:"consumer"`
 }
 
-func (c *EndToEndConfig) SetDefaults() {
+func (c *Config) SetDefaults() {
     c.Enabled = false
     c.ProbeInterval = 2 * time.Second
     c.TopicManagement.SetDefaults()
     c.Producer.SetDefaults()
     c.Consumer.SetDefaults()
 }
 
-func (c *EndToEndConfig) Validate() error {
+func (c *Config) Validate() error {
 
     if !c.Enabled {
         return nil
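
For callers, the practical effect of this rename is that the former minion.EndToEndConfig is now addressed as e2e.Config. A minimal usage sketch, assuming the module path github.com/cloudhut/kminion/v2 that appears in e2e/service.go below; the surrounding wiring is illustrative and not part of this commit:

```go
package main

import (
	"fmt"

	"github.com/cloudhut/kminion/v2/e2e"
)

func main() {
	// Was minion.EndToEndConfig before this commit; now simply e2e.Config.
	var cfg e2e.Config
	cfg.SetDefaults() // per the diff: Enabled=false, ProbeInterval=2s

	// With the default Enabled=false, Validate short-circuits and returns nil,
	// as shown in the hunk above.
	if err := cfg.Validate(); err != nil {
		fmt.Println("invalid end-to-end config:", err)
		return
	}
	fmt.Println("end-to-end config is valid (probe disabled by default)")
}
```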

minion/config_endtoend_consumer.go renamed to e2e/config_consumer.go

+1 -1

@@ -1,4 +1,4 @@
-package minion
+package e2e
 
 import (
     "fmt"

minion/config_endtoend_producer.go renamed to e2e/config_producer.go

+1 -1

@@ -1,4 +1,4 @@
-package minion
+package e2e
 
 import (
     "fmt"

minion/config_endtoend_topic.go renamed to e2e/config_topic.go

+1 -1

@@ -1,4 +1,4 @@
-package minion
+package e2e
 
 import (
     "fmt"

minion/endtoend_consumer.go renamed to e2e/consumer.go

+4 -4

@@ -1,4 +1,4 @@
-package minion
+package e2e
 
 import (
     "context"
@@ -13,10 +13,10 @@ import (
 
 func (s *Service) ConsumeFromManagementTopic(ctx context.Context) error {
     client := s.kafkaSvc.Client
-    topicName := s.Cfg.EndToEnd.TopicManagement.Name
+    topicName := s.config.TopicManagement.Name
     topic := kgo.ConsumeTopics(kgo.NewOffset().AtEnd(), topicName)
     balancer := kgo.Balancers(kgo.CooperativeStickyBalancer()) // Default GroupBalancer
-    switch s.Cfg.EndToEnd.Consumer.RebalancingProtocol {
+    switch s.config.Consumer.RebalancingProtocol {
     case RoundRobin:
         balancer = kgo.Balancers(kgo.RoundRobinBalancer())
     case Range:
@@ -28,7 +28,7 @@ func (s *Service) ConsumeFromManagementTopic(ctx context.Context) error {
 
     // todo: use minionID as part of group id
     //
-    client.AssignGroup(s.Cfg.EndToEnd.Consumer.GroupId, kgo.GroupTopics(topicName), balancer, kgo.DisableAutoCommit())
+    client.AssignGroup(s.config.Consumer.GroupId, kgo.GroupTopics(topicName), balancer, kgo.DisableAutoCommit())
     s.logger.Info("Starting to consume " + topicName)
 
     for {
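
The switch above keeps kgo's cooperative-sticky balancer as the default and only overrides it for explicitly configured protocols. A self-contained sketch of that selection pattern, with local stand-ins for the RebalancingProtocol constants (they live in the consumer config, which this diff only renames) and an assumed Sticky case:

```go
package main

import (
	"fmt"

	"github.com/twmb/franz-go/pkg/kgo"
)

// Local stand-ins for the config constants referenced in consumer.go.
type RebalancingProtocol int

const (
	CooperativeSticky RebalancingProtocol = iota
	RoundRobin
	Range
	Sticky // assumed; only RoundRobin and Range are visible in this hunk
)

// balancerFor mirrors the pattern above: cooperative-sticky unless overridden.
func balancerFor(p RebalancingProtocol) kgo.GroupBalancer {
	switch p {
	case RoundRobin:
		return kgo.RoundRobinBalancer()
	case Range:
		return kgo.RangeBalancer()
	case Sticky:
		return kgo.StickyBalancer()
	default:
		return kgo.CooperativeStickyBalancer()
	}
}

func main() {
	// consumer.go wraps the chosen balancer with kgo.Balancers(...) before
	// handing it to the consumer group assignment, as shown above.
	fmt.Printf("%T\n", balancerFor(Range))
}
```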

minion/endtoend_producer.go renamed to e2e/producer.go

+2 -2

@@ -1,4 +1,4 @@
-package minion
+package e2e
 
 import (
     "context"
@@ -17,7 +17,7 @@ type EndToEndMessage struct {
 
 func (s *Service) produceToManagementTopic(ctx context.Context) error {
 
-    topicName := s.Cfg.EndToEnd.TopicManagement.Name
+    topicName := s.config.TopicManagement.Name
 
     record, err := createEndToEndRecord(topicName, s.minionID)
     if err != nil {
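
createEndToEndRecord is referenced above but is not part of this diff. A hypothetical sketch of what such a helper could look like, based only on the facts visible in this commit (roundtrip messages carry a minionID and a creation timestamp in UTC milliseconds); the real EndToEndMessage fields may differ:

```go
package main

import (
	"encoding/json"
	"fmt"
	"time"

	"github.com/twmb/franz-go/pkg/kgo"
)

// Hypothetical message shape; the actual EndToEndMessage is defined in
// e2e/producer.go and is not shown in this diff.
type EndToEndMessage struct {
	MinionID  string  `json:"minionID"`
	Timestamp float64 `json:"timestamp"` // creation time in UTC milliseconds
}

func createEndToEndRecord(topicName string, minionID string) (*kgo.Record, error) {
	msg := EndToEndMessage{
		MinionID:  minionID,
		Timestamp: float64(time.Now().UTC().UnixNano()) / float64(time.Millisecond),
	}
	value, err := json.Marshal(msg)
	if err != nil {
		return nil, fmt.Errorf("failed to serialize end-to-end message: %w", err)
	}
	return &kgo.Record{Topic: topicName, Value: value}, nil
}

func main() {
	// Example values only; the topic name comes from config.TopicManagement.Name
	// and the minionID is a UUID generated in NewService.
	record, err := createEndToEndRecord("kminion-end-to-end", "example-minion-id")
	if err != nil {
		panic(err)
	}
	fmt.Println(record.Topic, string(record.Value))
}
```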

e2e/service.go

+134 (new file)

@@ -0,0 +1,134 @@
+package e2e
+
+import (
+    "context"
+    "fmt"
+    "time"
+
+    "github.com/cloudhut/kminion/v2/kafka"
+    "github.com/google/uuid"
+    "github.com/prometheus/client_golang/prometheus"
+    "github.com/prometheus/client_golang/prometheus/promauto"
+    "github.com/twmb/franz-go/pkg/kgo"
+    "go.uber.org/zap"
+)
+
+type Service struct {
+    // General
+    config Config
+    logger *zap.Logger
+
+    kafkaSvc *kafka.Service // creates kafka client for us
+    client   *kgo.Client
+
+    // Service
+    minionID               string  // unique identifier, reported in metrics, in case multiple instances run at the same time
+    lastRoundtripTimestamp float64 // creation time (in utc ms) of the message that most recently passed the roundtripSla check
+
+    // Metrics
+    endToEndMessagesProduced  prometheus.Counter
+    endToEndMessagesAcked     prometheus.Counter
+    endToEndMessagesReceived  prometheus.Counter
+    endToEndMessagesCommitted prometheus.Counter
+
+    endToEndAckLatency       prometheus.Histogram
+    endToEndRoundtripLatency prometheus.Histogram
+    endToEndCommitLatency    prometheus.Histogram
+}
+
+// NewService creates a new instance of the e2e monitoring service
+func NewService(cfg Config, logger *zap.Logger, kafkaSvc *kafka.Service, metricNamespace string) (*Service, error) {
+
+    svc := &Service{
+        config:   cfg,
+        logger:   logger,
+        kafkaSvc: kafkaSvc,
+        client:   nil,
+
+        minionID: uuid.NewString(),
+    }
+
+    makeCounter := func(name string, help string) prometheus.Counter {
+        return promauto.NewCounter(prometheus.CounterOpts{
+            Namespace: metricNamespace,
+            Subsystem: "end_to_end",
+            Name:      name,
+            Help:      help,
+        })
+    }
+    makeHistogram := func(name string, maxLatency time.Duration, help string) prometheus.Histogram {
+        return promauto.NewHistogram(prometheus.HistogramOpts{
+            Namespace: metricNamespace,
+            Subsystem: "end_to_end",
+            Name:      name,
+            Help:      help,
+            Buckets:   createHistogramBuckets(maxLatency),
+        })
+    }
+
+    // Low-level counters
+    // Users can construct alerts like "can't produce messages" themselves from these
+    svc.endToEndMessagesProduced = makeCounter("messages_produced_total", "Number of messages that kminion's end-to-end test has tried to send to kafka")
+    svc.endToEndMessagesAcked = makeCounter("messages_acked_total", "Number of messages kafka acknowledged as produced")
+    svc.endToEndMessagesReceived = makeCounter("messages_received_total", "Number of *matching* messages kminion received. Every roundtrip message has a minionID (randomly generated on startup) and a timestamp. Kminion only considers a message a match if it arrives within the configured roundtrip SLA (and it matches the minionID)")
+    svc.endToEndMessagesCommitted = makeCounter("messages_committed_total", "Number of *matching* messages kminion successfully committed as read/processed. See 'messages_received' for what 'matching' means. Kminion will commit late/mismatching messages to kafka as well, but those won't be counted in this metric.")
+
+    // Latency histograms
+    // More detailed info about how long each step took
+    // Since histograms also have an 'infinite' bucket, they can be used to detect small hiccups and "lost" messages
+    svc.endToEndAckLatency = makeHistogram("produce_latency_seconds", cfg.Producer.AckSla, "Time until we received an ack for a produced message")
+    svc.endToEndRoundtripLatency = makeHistogram("roundtrip_latency_seconds", cfg.Consumer.RoundtripSla, "Time it took between sending (producing) and receiving (consuming) a message")
+    svc.endToEndCommitLatency = makeHistogram("commit_latency_seconds", cfg.Consumer.CommitSla, "Time kafka took to respond to kminion's offset commit")
+
+    return svc, nil
+}
+
+// Start starts the service
+func (s *Service) Start(ctx context.Context) error {
+
+    if err := s.validateManagementTopic(ctx); err != nil {
+        return fmt.Errorf("could not validate end-to-end topic: %w", err)
+    }
+
+    go s.initEndToEnd(ctx)
+
+    return nil
+}
+
+// called from e2e when a message is acknowledged
+func (s *Service) onAck(partitionId int32, duration time.Duration) {
+    s.endToEndMessagesAcked.Inc()
+    s.endToEndAckLatency.Observe(duration.Seconds())
+}
+
+// called from e2e when a message completes a roundtrip (send to kafka, receive msg from kafka again)
+func (s *Service) onRoundtrip(partitionId int32, duration time.Duration) {
+    if duration > s.config.Consumer.RoundtripSla {
+        return // message is too old
+    }
+
+    // todo: track "lastRoundtripMessage"
+    // if msg.Timestamp < s.lastRoundtripTimestamp {
+    //     return // msg older than what we recently processed (out of order, should never happen)
+    // }
+
+    s.endToEndMessagesReceived.Inc()
+    s.endToEndRoundtripLatency.Observe(duration.Seconds())
+}
+
+// called from e2e when an offset commit is confirmed
+func (s *Service) onOffsetCommit(partitionId int32, duration time.Duration) {
+
+    // todo:
+    // if the commit took too long, don't count it in 'commits' but add it to the histogram?
+    // and how do we want to handle cases where we get an error??
+    // should we have another metric that tells us about failed commits? or a label on the counter?
+
+    s.endToEndCommitLatency.Observe(duration.Seconds())
+
+    if duration > s.config.Consumer.CommitSla {
+        return
+    }
+
+    s.endToEndMessagesCommitted.Inc()
+}
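
NewService relies on a createHistogramBuckets helper that is not included in this commit. A plausible sketch, assuming the intent is a set of exponentially growing bucket bounds capped at the configured SLA (so the +Inf bucket catches anything slower); the actual implementation may differ:

```go
package main

import (
	"fmt"
	"time"
)

// createHistogramBuckets returns bucket upper bounds (in seconds) that double
// from 5ms up to the given SLA, so anything slower than the SLA lands in the
// histogram's implicit +Inf bucket.
func createHistogramBuckets(maxLatency time.Duration) []float64 {
	buckets := []float64{}
	for b := 0.005; b < maxLatency.Seconds(); b *= 2 {
		buckets = append(buckets, b)
	}
	return append(buckets, maxLatency.Seconds())
}

func main() {
	fmt.Println(createHistogramBuckets(1 * time.Second))
	// Output: [0.005 0.01 0.02 0.04 0.08 0.16 0.32 0.64 1]
}
```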

minion/endtoend_topic.go renamed to e2e/topic.go

+41 -32

@@ -1,4 +1,4 @@
-package minion
+package e2e
 
 import (
     "context"
@@ -11,8 +11,10 @@ import (
 
 func (s *Service) validateManagementTopic(ctx context.Context) error {
 
-    expectedReplicationFactor := s.Cfg.EndToEnd.TopicManagement.ReplicationFactor
-    expectedNumPartitionsPerBroker := s.Cfg.EndToEnd.TopicManagement.PartitionsPerBroker
+    s.logger.Info("validating end-to-end topic...")
+
+    expectedReplicationFactor := s.config.TopicManagement.ReplicationFactor
+    expectedNumPartitionsPerBroker := s.config.TopicManagement.PartitionsPerBroker
     topicMetadata, err := s.getTopicMetadata(ctx)
     if err != nil {
         return err
@@ -43,7 +45,7 @@ func (s *Service) validateManagementTopic(ctx context.Context) error {
     // topicMetadata.Brokers will return all the available brokers from the cluster
     isNumBrokerValid := len(topicMetadata.Brokers) >= expectedReplicationFactor
     if !isNumBrokerValid {
-        return fmt.Errorf("current cluster size differs from the expected size. expected broker: %v NumOfBroker: %v", len(topicMetadata.Brokers), expectedReplicationFactor)
+        return fmt.Errorf("current cluster size differs from the expected size (based on config topicManagement.replicationFactor). expected broker: %v NumOfBroker: %v", len(topicMetadata.Brokers), expectedReplicationFactor)
     }
 
     // Check the number of Partition per broker, if it is too low create partition
@@ -55,7 +57,7 @@ func (s *Service) validateManagementTopic(ctx context.Context) error {
     assignment.Replicas = topicMetadata.Topics[0].Partitions[0].Replicas
 
     topic := kmsg.NewCreatePartitionsRequestTopic()
-    topic.Topic = s.Cfg.EndToEnd.TopicManagement.Name
+    topic.Topic = s.config.TopicManagement.Name
     topic.Count = int32(expectedNumPartitionsPerBroker) // Should be greater than current partition number
     topic.Assignment = []kmsg.CreatePartitionsRequestTopicAssignment{assignment}
 
@@ -86,17 +88,17 @@ func (s *Service) validateManagementTopic(ctx context.Context) error {
             }
         }
     }
-    assignmentInvalid := len(distinctLeaderNodes) != s.Cfg.EndToEnd.TopicManagement.ReplicationFactor
+    assignmentInvalid := len(distinctLeaderNodes) != s.config.TopicManagement.ReplicationFactor
     // Reassign Partitions on invalid assignment
     if assignmentInvalid {
         // Get the new AssignedReplicas by checking the ReplicationFactor config
-        assignedReplicas := make([]int32, s.Cfg.EndToEnd.TopicManagement.ReplicationFactor)
+        assignedReplicas := make([]int32, s.config.TopicManagement.ReplicationFactor)
         for index := range assignedReplicas {
             assignedReplicas[index] = int32(index)
         }
 
         // Generate the partition assignments from PartitionPerBroker config
-        partitions := make([]int32, s.Cfg.EndToEnd.TopicManagement.PartitionsPerBroker)
+        partitions := make([]int32, s.config.TopicManagement.PartitionsPerBroker)
         reassignedPartitions := []kmsg.AlterPartitionAssignmentsRequestTopicPartition{}
         for index := range partitions {
             rp := kmsg.NewAlterPartitionAssignmentsRequestTopicPartition()
@@ -106,7 +108,7 @@ func (s *Service) validateManagementTopic(ctx context.Context) error {
         }
 
         managamentTopicReassignment := kmsg.NewAlterPartitionAssignmentsRequestTopic()
-        managamentTopicReassignment.Topic = s.Cfg.EndToEnd.TopicManagement.Name
+        managamentTopicReassignment.Topic = s.config.TopicManagement.Name
         managamentTopicReassignment.Partitions = reassignedPartitions
 
         reassignment := kmsg.NewAlterPartitionAssignmentsRequest()
@@ -153,9 +155,9 @@ func createTopicConfig(cfgTopic EndToEndTopicConfig) []kmsg.CreateTopicsRequestT
 
 func (s *Service) createManagementTopic(ctx context.Context, topicMetadata *kmsg.MetadataResponse) error {
 
-    s.logger.Info(fmt.Sprintf("creating topic %s for EndToEnd metrics", s.Cfg.EndToEnd.TopicManagement.Name))
+    s.logger.Info(fmt.Sprintf("creating topic %s for EndToEnd metrics", s.config.TopicManagement.Name))
 
-    cfgTopic := s.Cfg.EndToEnd.TopicManagement
+    cfgTopic := s.config.TopicManagement
     topicConfigs := createTopicConfig(cfgTopic)
 
     topic := kmsg.NewCreateTopicsRequestTopic()
@@ -200,7 +202,7 @@ func (s *Service) createManagementTopic(ctx context.Context, topicMetadata *kmsg
 
 func (s *Service) getTopicMetadata(ctx context.Context) (*kmsg.MetadataResponse, error) {
 
-    cfg := s.Cfg.EndToEnd.TopicManagement
+    cfg := s.config.TopicManagement
     topicReq := kmsg.NewMetadataRequestTopic()
     topicReq.Topic = &cfg.Name
 
@@ -217,31 +219,38 @@ func (s *Service) getTopicMetadata(ctx context.Context) (*kmsg.MetadataResponse,
 
 func (s *Service) initEndToEnd(ctx context.Context) {
 
-    reconciliationInterval := s.Cfg.EndToEnd.TopicManagement.ReconciliationInterval
-    c1 := make(chan error, 1)
+    validateTopicTicker := time.NewTicker(s.config.TopicManagement.ReconciliationInterval)
+    produceTicker := time.NewTicker(s.config.ProbeInterval)
+    // stop tickers when context is cancelled
+    go func() {
+        <-ctx.Done()
+        produceTicker.Stop()
+        validateTopicTicker.Stop()
+    }()
 
-    // Run long running function on validating or reconciling that might be timeout
+    // keep checking end-to-end topic
     go func() {
-        err := s.validateManagementTopic(ctx)
-        c1 <- err
+        for range validateTopicTicker.C {
+            err := s.validateManagementTopic(ctx)
+            if err != nil {
+                s.logger.Error("failed to validate end-to-end topic: %w", zap.Error(err))
+            }
+        }
     }()
 
-    // Listen on our channel AND a timeout channel - which ever happens first.
-    select {
-    case err := <-c1:
-        s.logger.Warn("failed to validate management topic for endtoend metrics", zap.Error(err))
-        return
-    case <-time.After(reconciliationInterval):
-        s.logger.Warn("time exceeded while validating/reconciling management topic of endtoend metrics")
-        return
-    default:
-        go s.ConsumeFromManagementTopic(ctx)
-
-        t := time.NewTicker(s.Cfg.EndToEnd.ProbeInterval)
-        for range t.C {
-            s.produceToManagementTopic(ctx)
+    // start consuming topic
+    go s.ConsumeFromManagementTopic(ctx)
+
+    // start producing to topic
+    go func() {
+        for range produceTicker.C {
+            err := s.produceToManagementTopic(ctx)
+            if err != nil {
+                s.logger.Error("failed to produce to end-to-end topic: %w", zap.Error(err))
+            }
         }
-    }
+    }()
+
 }
 
 func timeNowMs() float64 {
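
Taken together, Start validates the management topic once up front, and initEndToEnd then runs the validate and produce loops on tickers that stop when the context is cancelled. A minimal wiring sketch, assuming the module path seen in service.go; how the *kafka.Service is built is outside this commit, so it is passed in here, and the "kminion" metric namespace is illustrative:

```go
package example

import (
	"context"

	"github.com/cloudhut/kminion/v2/e2e"
	"github.com/cloudhut/kminion/v2/kafka"
	"go.uber.org/zap"
)

// startEndToEnd wires up the refactored e2e service. Cancelling ctx stops the
// validation and produce tickers started by initEndToEnd.
func startEndToEnd(ctx context.Context, cfg e2e.Config, kafkaSvc *kafka.Service) error {
	logger, err := zap.NewProduction()
	if err != nil {
		return err
	}

	svc, err := e2e.NewService(cfg, logger, kafkaSvc, "kminion")
	if err != nil {
		return err
	}

	// Start validates the end-to-end topic once, then launches the
	// consume/produce/validate goroutines in the background.
	return svc.Start(ctx)
}
```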
