Skip to content

Commit cb7c60c

Browse files
authored
Expose hash of the config files used as metric (#2874)
This allows monitoring the roll-out of a new config file across the cluster and can help to detect a mismatch in active config files. It supports both start-up and runtime config. Signed-off-by: Christian Simon <[email protected]>
1 parent d099a1f commit cb7c60c

File tree

5 files changed

+121
-55
lines changed

5 files changed

+121
-55
lines changed

CHANGELOG.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
* [CHANGE] Added the `engine` label to the metrics exposed by the Prometheus query engine, to distinguish between `ruler` and `querier` metrics. #2854
1818
* [CHANGE] Added ruler to the single binary when started with `-target=all` (default). #2854
1919
* [CHANGE] Experimental TSDB: compact head when opening TSDB. This should only affect ingester startup after it was unable to compact head in previous run. #2870
20+
* [CHANGE] Metric `cortex_overrides_last_reload_successful` has been renamed to `cortex_runtime_config_last_reload_successful`. #2874
2021
* [FEATURE] Introduced `ruler.for-outage-tolerance`, Max time to tolerate outage for restoring "for" state of alert. #2783
2122
* [FEATURE] Introduced `ruler.for-grace-period`, Minimum duration between alert and restored "for" state. This is maintained only for alerts with configured "for" time greater than grace period. #2783
2223
* [FEATURE] Introduced `ruler.resend-delay`, Minimum amount of time to wait before resending an alert to Alertmanager. #2783
@@ -62,6 +63,7 @@
6263
* [ENHANCEMENT] Ruler: `-ruler.alertmanager-url` now supports multiple URLs. Each URL is treated as a separate Alertmanager group. Support for multiple Alertmanagers in a group can be achieved by using DNS service discovery. #2851
6364
* [ENHANCEMENT] Experimental TSDB: Cortex Flusher now works with blocks engine. Flusher needs to be provided with blocks-engine configuration, existing Flusher flags are not used (they are only relevant for chunks engine). Note that flush errors are only reported via log. #2877
6465
* [ENHANCEMENT] Flusher: Added `-flusher.exit-after-flush` option (defaults to true) to control whether Cortex should stop completely after Flusher has finished its work. #2877
66+
* [ENHANCEMENT] Added metrics `cortex_config_hash` and `cortex_runtime_config_hash` to expose hash of the currently active config file. #2874
6567
* [BUGFIX] Fixed a bug in the index intersect code causing storage to return more chunks/series than required. #2796
6668
* [BUGFIX] Fixed the number of reported keys in the background cache queue. #2764
6769
* [BUGFIX] Fix race in processing of headers in sharded queries. #2762

cmd/cortex/main.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package main
22

33
import (
4+
"crypto/sha256"
45
"flag"
56
"fmt"
67
"io/ioutil"
@@ -29,11 +30,21 @@ var (
2930
Revision string
3031
)
3132

33+
// configHash exposes information about the loaded config
34+
var configHash *prometheus.GaugeVec = prometheus.NewGaugeVec(
35+
prometheus.GaugeOpts{
36+
Name: "cortex_config_hash",
37+
Help: "Hash of the currently active config file.",
38+
},
39+
[]string{"sha256"},
40+
)
41+
3242
func init() {
3343
version.Version = Version
3444
version.Branch = Branch
3545
version.Revision = Revision
3646
prometheus.MustRegister(version.NewCollector("cortex"))
47+
prometheus.MustRegister(configHash)
3748
}
3849

3950
const (
@@ -184,6 +195,12 @@ func LoadConfig(filename string, expandENV bool, cfg *cortex.Config) error {
184195
return errors.Wrap(err, "Error reading config file")
185196
}
186197

198+
// create a sha256 hash of the config before expansion and expose it via
199+
// the cortex_config_hash metric
200+
hash := sha256.Sum256(buf)
201+
configHash.Reset()
202+
configHash.WithLabelValues(fmt.Sprintf("%x", hash)).Set(1)
203+
187204
if expandENV {
188205
buf = expandEnv(buf)
189206
}

pkg/cortex/runtime_config.go

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
package cortex
22

33
import (
4-
"os"
4+
"io"
55

66
"gopkg.in/yaml.v2"
77

@@ -19,15 +19,10 @@ type runtimeConfigValues struct {
1919
Multi kv.MultiRuntimeConfig `yaml:"multi_kv_config"`
2020
}
2121

22-
func loadRuntimeConfig(filename string) (interface{}, error) {
23-
f, err := os.Open(filename)
24-
if err != nil {
25-
return nil, err
26-
}
27-
22+
func loadRuntimeConfig(r io.Reader) (interface{}, error) {
2823
var overrides = &runtimeConfigValues{}
2924

30-
decoder := yaml.NewDecoder(f)
25+
decoder := yaml.NewDecoder(r)
3126
decoder.SetStrict(true)
3227
if err := decoder.Decode(&overrides); err != nil {
3328
return nil, err

pkg/util/runtimeconfig/manager.go

Lines changed: 36 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,36 @@
11
package runtimeconfig
22

33
import (
4+
"bytes"
45
"context"
6+
"crypto/sha256"
7+
"errors"
58
"flag"
9+
"fmt"
10+
"io"
11+
"io/ioutil"
612
"sync"
713
"time"
814

915
"github.com/go-kit/kit/log/level"
1016
"github.com/prometheus/client_golang/prometheus"
17+
"github.com/prometheus/client_golang/prometheus/promauto"
1118

1219
"github.com/cortexproject/cortex/pkg/util"
1320
"github.com/cortexproject/cortex/pkg/util/services"
1421
)
1522

1623
// Loader loads the configuration from the given reader.
17-
type Loader func(filename string) (interface{}, error)
24+
type Loader func(r io.Reader) (interface{}, error)
1825

1926
// ManagerConfig holds the config for a Manager instance.
2027
// It holds config related to loading per-tenant config.
2128
type ManagerConfig struct {
2229
ReloadPeriod time.Duration `yaml:"period"`
23-
LoadPath string `yaml:"file"`
24-
Loader Loader `yaml:"-"`
30+
// LoadPath contains the path to the runtime config file, requires a
31+
// non-empty value
32+
LoadPath string `yaml:"file"`
33+
Loader Loader `yaml:"-"`
2534
}
2635

2736
// RegisterFlags registers flags.
@@ -44,20 +53,25 @@ type Manager struct {
4453
config interface{}
4554

4655
configLoadSuccess prometheus.Gauge
56+
configHash *prometheus.GaugeVec
4757
}
4858

4959
// NewRuntimeConfigManager creates an instance of Manager and starts reload config loop based on config
5060
func NewRuntimeConfigManager(cfg ManagerConfig, registerer prometheus.Registerer) (*Manager, error) {
61+
if cfg.LoadPath == "" {
62+
return nil, errors.New("LoadPath is empty")
63+
}
64+
5165
mgr := Manager{
5266
cfg: cfg,
53-
configLoadSuccess: prometheus.NewGauge(prometheus.GaugeOpts{
54-
Name: "cortex_overrides_last_reload_successful",
55-
Help: "Whether the last config reload attempt was successful.",
67+
configLoadSuccess: promauto.With(registerer).NewGauge(prometheus.GaugeOpts{
68+
Name: "cortex_runtime_config_last_reload_successful",
69+
Help: "Whether the last runtime-config reload attempt was successful.",
5670
}),
57-
}
58-
59-
if registerer != nil {
60-
registerer.MustRegister(mgr.configLoadSuccess)
71+
configHash: promauto.With(registerer).NewGaugeVec(prometheus.GaugeOpts{
72+
Name: "cortex_runtime_config_hash",
73+
Help: "Hash of the currently active runtime config file.",
74+
}, []string{"sha256"}),
6175
}
6276

6377
mgr.Service = services.NewBasicService(mgr.start, mgr.loop, mgr.stop)
@@ -131,7 +145,14 @@ func (om *Manager) loop(ctx context.Context) error {
131145
// loadConfig loads configuration using the loader function, and if successful,
132146
// stores it as current configuration and notifies listeners.
133147
func (om *Manager) loadConfig() error {
134-
cfg, err := om.cfg.Loader(om.cfg.LoadPath)
148+
buf, err := ioutil.ReadFile(om.cfg.LoadPath)
149+
if err != nil {
150+
om.configLoadSuccess.Set(0)
151+
return err
152+
}
153+
hash := sha256.Sum256(buf)
154+
155+
cfg, err := om.cfg.Loader(bytes.NewReader(buf))
135156
if err != nil {
136157
om.configLoadSuccess.Set(0)
137158
return err
@@ -141,6 +162,10 @@ func (om *Manager) loadConfig() error {
141162
om.setConfig(cfg)
142163
om.callListeners(cfg)
143164

165+
// expose hash of runtime config
166+
om.configHash.Reset()
167+
om.configHash.WithLabelValues(fmt.Sprintf("%x", hash[:])).Set(1)
168+
144169
return nil
145170
}
146171

pkg/util/runtimeconfig/manager_test.go

Lines changed: 63 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,18 @@ package runtimeconfig
22

33
import (
44
"context"
5+
"crypto/sha256"
6+
"fmt"
7+
"io"
58
"io/ioutil"
69
"os"
10+
"strings"
711
"testing"
812
"time"
913

14+
"github.com/prometheus/client_golang/prometheus"
15+
"github.com/prometheus/client_golang/prometheus/testutil"
16+
"github.com/stretchr/testify/assert"
1017
"github.com/stretchr/testify/require"
1118
"go.uber.org/atomic"
1219
"gopkg.in/yaml.v2"
@@ -35,24 +42,40 @@ func (l *TestLimits) UnmarshalYAML(unmarshal func(interface{}) error) error {
3542
return unmarshal((*plain)(l))
3643
}
3744

38-
func testLoadOverrides(filename string) (interface{}, error) {
39-
f, err := os.Open(filename)
40-
if err != nil {
41-
return nil, err
42-
}
43-
defer f.Close()
44-
45+
func testLoadOverrides(r io.Reader) (interface{}, error) {
4546
var overrides = &testOverrides{}
4647

47-
decoder := yaml.NewDecoder(f)
48+
decoder := yaml.NewDecoder(r)
4849
decoder.SetStrict(true)
4950
if err := decoder.Decode(&overrides); err != nil {
5051
return nil, err
5152
}
52-
5353
return overrides, nil
5454
}
5555

56+
func newTestOverridesManagerConfig(t *testing.T, i int32) (*atomic.Int32, ManagerConfig) {
57+
var config = atomic.NewInt32(i)
58+
59+
// create empty file
60+
tempFile, err := ioutil.TempFile("", "test-validation")
61+
require.NoError(t, err)
62+
63+
t.Cleanup(func() {
64+
tempFile.Close()
65+
os.Remove(tempFile.Name())
66+
})
67+
68+
// testing NewRuntimeConfigManager with overrides reload config set
69+
return config, ManagerConfig{
70+
ReloadPeriod: 5 * time.Second,
71+
LoadPath: tempFile.Name(),
72+
Loader: func(_ io.Reader) (i interface{}, err error) {
73+
val := int(config.Load())
74+
return val, nil
75+
},
76+
}
77+
}
78+
5679
func TestNewOverridesManager(t *testing.T) {
5780
tempFile, err := ioutil.TempFile("", "test-validation")
5881
require.NoError(t, err)
@@ -98,9 +121,10 @@ func TestOverridesManager_ListenerWithDefaultLimits(t *testing.T) {
98121
require.NoError(t, os.Remove(tempFile.Name()))
99122
}()
100123

101-
err = ioutil.WriteFile(tempFile.Name(), []byte(`overrides:
124+
config := []byte(`overrides:
102125
user1:
103-
limit2: 150`), 0600)
126+
limit2: 150`)
127+
err = ioutil.WriteFile(tempFile.Name(), config, 0600)
104128
require.NoError(t, err)
105129

106130
defaultTestLimits = &TestLimits{Limit1: 100}
@@ -112,17 +136,30 @@ func TestOverridesManager_ListenerWithDefaultLimits(t *testing.T) {
112136
Loader: testLoadOverrides,
113137
}
114138

115-
overridesManager, err := NewRuntimeConfigManager(overridesManagerConfig, nil)
139+
reg := prometheus.NewPedanticRegistry()
140+
141+
overridesManager, err := NewRuntimeConfigManager(overridesManagerConfig, reg)
116142
require.NoError(t, err)
117143
require.NoError(t, services.StartAndAwaitRunning(context.Background(), overridesManager))
118144

145+
// check that the metrics expose the hash of the config file before it is rewritten
146+
assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(fmt.Sprintf(`
147+
# HELP cortex_runtime_config_hash Hash of the currently active runtime config file.
148+
# TYPE cortex_runtime_config_hash gauge
149+
cortex_runtime_config_hash{sha256="%s"} 1
150+
# HELP cortex_runtime_config_last_reload_successful Whether the last runtime-config reload attempt was successful.
151+
# TYPE cortex_runtime_config_last_reload_successful gauge
152+
cortex_runtime_config_last_reload_successful 1
153+
`, fmt.Sprintf("%x", sha256.Sum256(config))))))
154+
119155
// need to use buffer, otherwise loadConfig will throw away update
120156
ch := overridesManager.CreateListenerChannel(1)
121157

122158
// rewrite file
123-
err = ioutil.WriteFile(tempFile.Name(), []byte(`overrides:
159+
config = []byte(`overrides:
124160
user2:
125-
limit2: 200`), 0600)
161+
limit2: 200`)
162+
err = ioutil.WriteFile(tempFile.Name(), config, 0600)
126163
require.NoError(t, err)
127164

128165
// reload
@@ -141,6 +178,16 @@ func TestOverridesManager_ListenerWithDefaultLimits(t *testing.T) {
141178
require.Equal(t, 200, to.Overrides["user2"].Limit2) // new overrides
142179
require.Equal(t, 100, to.Overrides["user2"].Limit1) // from defaults
143180

181+
// check if the metrics have been updated
182+
assert.NoError(t, testutil.GatherAndCompare(reg, strings.NewReader(fmt.Sprintf(`
183+
# HELP cortex_runtime_config_hash Hash of the currently active runtime config file.
184+
# TYPE cortex_runtime_config_hash gauge
185+
cortex_runtime_config_hash{sha256="%s"} 1
186+
# HELP cortex_runtime_config_last_reload_successful Whether the last runtime-config reload attempt was successful.
187+
# TYPE cortex_runtime_config_last_reload_successful gauge
188+
cortex_runtime_config_last_reload_successful 1
189+
`, fmt.Sprintf("%x", sha256.Sum256(config))))))
190+
144191
// Cleaning up
145192
require.NoError(t, services.StopAndAwaitTerminated(context.Background(), overridesManager))
146193

@@ -149,17 +196,7 @@ func TestOverridesManager_ListenerWithDefaultLimits(t *testing.T) {
149196
}
150197

151198
func TestOverridesManager_ListenerChannel(t *testing.T) {
152-
var config = atomic.NewInt32(555)
153-
154-
// testing NewRuntimeConfigManager with overrides reload config set
155-
overridesManagerConfig := ManagerConfig{
156-
ReloadPeriod: 5 * time.Second,
157-
LoadPath: "ignored",
158-
Loader: func(filename string) (i interface{}, err error) {
159-
val := int(config.Load())
160-
return val, nil
161-
},
162-
}
199+
config, overridesManagerConfig := newTestOverridesManagerConfig(t, 555)
163200

164201
overridesManager, err := NewRuntimeConfigManager(overridesManagerConfig, nil)
165202
require.NoError(t, err)
@@ -199,17 +236,7 @@ func TestOverridesManager_ListenerChannel(t *testing.T) {
199236
}
200237

201238
func TestOverridesManager_StopClosesListenerChannels(t *testing.T) {
202-
var config = atomic.NewInt32(555)
203-
204-
// testing NewRuntimeConfigManager with overrides reload config set
205-
overridesManagerConfig := ManagerConfig{
206-
ReloadPeriod: 5 * time.Second,
207-
LoadPath: "ignored",
208-
Loader: func(filename string) (i interface{}, err error) {
209-
val := int(config.Load())
210-
return val, nil
211-
},
212-
}
239+
_, overridesManagerConfig := newTestOverridesManagerConfig(t, 555)
213240

214241
overridesManager, err := NewRuntimeConfigManager(overridesManagerConfig, nil)
215242
require.NoError(t, err)

0 commit comments

Comments
 (0)