Use BasicService in state_replication.go for replication loop. #3930
Changes from 1 commit
The Alertmanager constructor now starts the replication service explicitly:

```diff
@@ -137,7 +137,13 @@ func New(cfg *Config, reg *prometheus.Registry) (*Alertmanager, error) {
 		am.state = cfg.Peer
 	} else if cfg.ShardingEnabled {
 		level.Debug(am.logger).Log("msg", "starting tenant alertmanager with ring-based replication")
-		am.state = newReplicatedStates(cfg.UserID, cfg.ReplicationFactor, cfg.ReplicateStateFunc, cfg.GetPositionFunc, am.stop, am.logger, am.registry)
+		state := newReplicatedStates(cfg.UserID, cfg.ReplicationFactor, cfg.ReplicateStateFunc, cfg.GetPositionFunc, am.logger, am.registry)
+
+		if err := state.Service.StartAsync(context.Background()); err != nil {
+			return nil, fmt.Errorf("failed to start state replication service: %v", err)
+		}
+
+		am.state = state
 	} else {
 		level.Debug(am.logger).Log("msg", "starting tenant alertmanager without replication")
 		am.state = &NilPeer{}
```
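One thing this diff leans on: `StartAsync` only transitions the service out of its NEW state and returns immediately, without waiting for the replication loop to actually run. A hedged sketch of how a caller could wait explicitly, if that were needed (the second error branch is an assumption, not part of the diff):

```go
// Sketch only: start the service, then block until it reaches Running.
if err := state.Service.StartAsync(context.Background()); err != nil {
	return nil, fmt.Errorf("failed to start state replication service: %v", err)
}
// AwaitRunning blocks until the service is Running, or returns an error
// if the service fails during startup.
if err := state.Service.AwaitRunning(context.Background()); err != nil {
	return nil, fmt.Errorf("state replication service failed to start: %v", err)
}
```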
```diff
@@ -319,12 +325,21 @@ func (am *Alertmanager) Stop() {
 		am.dispatcher.Stop()
 	}

+	if service, ok := am.state.(*state); ok {
```

A reviewer suggested asserting the generic interface on this line instead of the concrete type:

```diff
-	if service, ok := am.state.(*state); ok {
+	if service, ok := am.state.(services.Service); ok {
```
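With that suggestion taken, the stop path stays decoupled from the concrete `*state` type. A sketch of the shutdown sequence under that assumption; the `AwaitTerminated` call and the log message are illustrative, since the diff only shows the assertion:

```go
// Sketch of the shutdown path with the suggested interface assertion.
if service, ok := am.state.(services.Service); ok {
	// StopAsync cancels the context passed to the running function.
	service.StopAsync()
	// Optionally wait for the replication loop to actually exit.
	if err := service.AwaitTerminated(context.Background()); err != nil {
		level.Warn(am.logger).Log("msg", "failed to stop state replication service", "err", err)
	}
}
```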
That being said, the next step of refactoring I see (but I would do it in a separate PR) is to require `am.state` to implement the `services.Service` interface. We can wrap `cfg.Peer` into our own struct which implements `services.Service` (using `services.NewIdleService()`) and add `services.Service` to `NilPeer` too. This way we keep all implementations consistent.

Thoughts @stevesg @pstibrany ?
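A minimal sketch of that follow-up (not part of this PR), assuming the cortex `services` package: an idle service has no run loop, which matches `NilPeer`'s no-op behavior. The constructor name is hypothetical.

```go
// NilPeer satisfying services.Service, per the suggestion above.
type NilPeer struct {
	services.Service
}

func newNilPeer() *NilPeer {
	np := &NilPeer{}
	// Nil starting/stopping functions: the service just idles until stopped.
	np.Service = services.NewIdleService(nil, nil)
	return np
}
```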
Seems reasonable. We should consider how to call `Settle` for the non-sharded case too; ideally that would be consistent (the Service wrapper for the Peer does the settling in starting, as for the replication). A sketch of that idea follows below.

The alternative to the road we've started on is to have a single "PeerService" which wraps any type of Peer, though that would then need to be extended with some sort of "run" function, so you'd probably end up back at making them all services...
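A hedged sketch of the wrapper with settling in the starting phase; `peerService`, its constructor, and the settle timeout are assumptions, not code from this PR:

```go
// peerService wraps the gossip Peer so it satisfies services.Service.
type peerService struct {
	services.Service
	peer *cluster.Peer
}

func newPeerService(p *cluster.Peer) *peerService {
	ps := &peerService{peer: p}
	starting := func(ctx context.Context) error {
		// Settle blocks until cluster gossip has converged (or ctx is done),
		// so the service only reaches Running once the peer is ready —
		// mirroring what the replicated state does for the sharded case.
		p.Settle(ctx, cluster.DefaultGossipInterval*10)
		return nil
	}
	ps.Service = services.NewIdleService(starting, nil)
	return ps
}
```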
> so you'd probably end up back at making them all services...

I think having separate services would keep things cleaner in the end. Code duplication should be minimal.
In state_replication.go:

```diff
@@ -13,10 +13,14 @@ import (
 	"github.com/prometheus/alertmanager/cluster"
 	"github.com/prometheus/alertmanager/cluster/clusterpb"
 	"github.com/prometheus/client_golang/prometheus"
+
+	"github.com/cortexproject/cortex/pkg/util/services"
 )

 // state represents the Alertmanager silences and notification log internal state.
 type state struct {
+	services.Service
+
 	userID string
 	logger log.Logger
 	reg    prometheus.Registerer
@@ -34,12 +38,11 @@ type state struct {
 	stateReplicationFailed *prometheus.CounterVec

 	msgc   chan *clusterpb.Part
-	stopc  chan struct{}
 	readyc chan struct{}
 }

 // newReplicatedStates creates a new state struct, which manages state to be replicated between alertmanagers.
-func newReplicatedStates(userID string, rf int, f func(context.Context, string, *clusterpb.Part) error, pf func(string) int, stopc chan struct{}, l log.Logger, r prometheus.Registerer) *state {
+func newReplicatedStates(userID string, rf int, f func(context.Context, string, *clusterpb.Part) error, pf func(string) int, l log.Logger, r prometheus.Registerer) *state {

 	s := &state{
 		logger: l,
@@ -50,7 +53,6 @@ func newReplicatedStates(userID string, rf int, f func(context.Context, string,
 		states: make(map[string]cluster.State, 2), // we use two, one for the notifications and one for silences.
 		msgc:   make(chan *clusterpb.Part),
 		readyc: make(chan struct{}),
-		stopc:  stopc,
 		reg:    r,
 		partialStateMergesTotal: promauto.With(r).NewCounterVec(prometheus.CounterOpts{
 			Name: "alertmanager_partial_state_merges_total",
@@ -70,7 +72,8 @@ func newReplicatedStates(userID string, rf int, f func(context.Context, string,
 		}, []string{"key"}),
 	}

-	go s.handleBroadcasts()
+	s.Service = services.NewBasicService(nil, s.running, nil)
+
 	return s
 }
```
@@ -142,23 +145,22 @@ func (s *state) Ready() bool { | |
return false | ||
} | ||
|
||
func (s *state) handleBroadcasts() { | ||
func (s *state) running(ctx context.Context) error { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let me leave some comments here, but not related to
Thoughts @stevesg @pstibrany ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I like the idea of implementing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Makes sense to me. I was just going to ask why do we still need There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Love this idea as well! |
||
for { | ||
select { | ||
case p := <-s.msgc: | ||
// If the replication factor is <= 1, we don't need to replicate any state anywhere else. | ||
if s.replicationFactor <= 1 { | ||
return | ||
return nil | ||
} | ||
|
||
s.stateReplicationTotal.WithLabelValues(p.Key).Inc() | ||
ctx := context.Background() //TODO: I probably need a better context | ||
if err := s.replicateStateFunc(ctx, s.userID, p); err != nil { | ||
s.stateReplicationFailed.WithLabelValues(p.Key).Inc() | ||
level.Error(s.logger).Log("msg", "failed to replicate state to other alertmanagers", "user", s.userID, "key", p.Key, "err", err) | ||
} | ||
case <-s.stopc: | ||
return | ||
case <-ctx.Done(): | ||
return nil | ||
} | ||
} | ||
} | ||
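To summarize the pattern this PR adopts: `services.NewBasicService(starting, running, stopping)` accepts nil for the starting and stopping functions, and the running function's context is cancelled when `StopAsync` is called, replacing the hand-rolled `stopc` channel. A self-contained sketch; `package main` and the string message type are illustrative, not from the PR:

```go
package main

import (
	"context"
	"fmt"

	"github.com/cortexproject/cortex/pkg/util/services"
)

func main() {
	msgc := make(chan string)

	// The running function loops until its context is cancelled,
	// just like state.running in this PR.
	running := func(ctx context.Context) error {
		for {
			select {
			case m := <-msgc:
				fmt.Println("replicating:", m)
			case <-ctx.Done():
				// Returning nil signals a clean shutdown to the service.
				return nil
			}
		}
	}

	// Nil starting/stopping functions, as in newReplicatedStates.
	svc := services.NewBasicService(nil, running, nil)

	_ = svc.StartAsync(context.Background())
	_ = svc.AwaitRunning(context.Background())

	msgc <- "silences"

	svc.StopAsync() // cancels the running function's context
	_ = svc.AwaitTerminated(context.Background())
}
```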