Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a periodic test of the autoseal to detect loss of connectivity. #13078

Merged
merged 17 commits into from
Nov 10, 2021
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions changelog/13078.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
core: Periodically test the health of connectivity to auto-seal backends
```
8 changes: 8 additions & 0 deletions vault/core.go
Original file line number Diff line number Diff line change
Expand Up @@ -2143,6 +2143,10 @@ func (c *Core) postUnseal(ctx context.Context, ctxCancelFunc context.CancelFunc,
if err := seal.UpgradeKeys(c.activeContext); err != nil {
c.logger.Warn("post-unseal upgrade seal keys failed", "error", err)
}

// Start a periodic but infrequent heartbeat to detect auto-seal backend outages at runtime rather than being
// surprised by this at the next need to unseal.
seal.StartHealthCheck()
}

c.metricsCh = make(chan struct{})
Expand Down Expand Up @@ -2224,6 +2228,10 @@ func (c *Core) preSeal() error {
c.autoRotateCancel = nil
}

if seal, ok := c.seal.(*autoSeal); ok {
seal.StopHealthCheck()
}

preSealPhysical(c)

c.logger.Info("pre-seal teardown complete")
Expand Down
97 changes: 96 additions & 1 deletion vault/seal_autoseal.go
Original file line number Diff line number Diff line change
@@ -1,11 +1,15 @@
package vault

import (
"bytes"
"context"
"crypto/subtle"
"encoding/json"
"fmt"
mathrand "math/rand"
"sync"
"sync/atomic"
"time"

proto "github.com/golang/protobuf/proto"
log "github.com/hashicorp/go-hclog"
Expand All @@ -16,7 +20,14 @@ import (

// barrierTypeUpgradeCheck checks for backwards compat on barrier type, not
// applicable in the OSS side
var barrierTypeUpgradeCheck = func(_ string, _ *SealConfig) {}
var (
barrierTypeUpgradeCheck = func(_ string, _ *SealConfig) {}
autoSealUnavailableDuration = []string{"seal", "unreachable", "time"}
// vars for unit testings
sealHealthTestIntervalNominal = 10 * time.Minute
sealHealthTestIntervalUnhealthy = 1 * time.Minute
sealHealthTestTimeout = 1 * time.Minute
)

// autoSeal is a Seal implementation that contains logic for encrypting and
// decrypting stored keys via an underlying AutoSealAccess implementation, as
Expand All @@ -28,6 +39,9 @@ type autoSeal struct {
recoveryConfig atomic.Value
core *Core
logger log.Logger

hcLock sync.Mutex
healthCheckStop chan struct{}
}

// Ensure we are implementing the Seal interface
Expand Down Expand Up @@ -499,3 +513,84 @@ func (d *autoSeal) migrateRecoveryConfig(ctx context.Context) error {

return nil
}

// StartHealthCheck starts a goroutine that tests the health of the auto-unseal backend once every 10 minutes.
// If unhealthy, logs a warning on the condition and begins testing every one minute until healthy again.
func (d *autoSeal) StartHealthCheck() {
d.StopHealthCheck()
d.hcLock.Lock()
defer d.hcLock.Unlock()

healthCheck := time.NewTicker(sealHealthTestIntervalNominal)
d.healthCheckStop = make(chan struct{})
healthCheckStop := d.healthCheckStop

go func() {
ctx := d.core.activeContext
lastTestOk := true
lastSeenOk := time.Now()
for {
select {
case <-healthCheckStop:
if healthCheck != nil {
healthCheck.Stop()
}
healthCheckStop = nil
return
case t := <-healthCheck.C:
func() {
ctx, cancel := context.WithTimeout(ctx, sealHealthTestTimeout)
defer cancel()

testVal := fmt.Sprintf("Heartbeat %d", mathrand.Intn(1000))
ciphertext, err := d.Access.Encrypt(ctx, []byte(testVal), nil)

if err != nil {
d.logger.Warn("failed to encrypt seal health test value, seal backend may be unreachable", "error", err)
if lastTestOk {
healthCheck.Reset(sealHealthTestIntervalUnhealthy)
}
lastTestOk = false
d.core.MetricSink().SetGauge(autoSealUnavailableDuration, 0)
} else {
plaintext, err := d.Access.Decrypt(ctx, ciphertext, nil)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same timeout (shared) across both Encrypt/Decrypt, or should we have a 1m timeout for each?

if err != nil {
d.logger.Warn("failed to decrypt seal health test value, seal backend may be unreachable", "error", err)
if lastTestOk {
healthCheck.Reset(sealHealthTestIntervalUnhealthy)
}
lastTestOk = false
}
if !bytes.Equal([]byte(testVal), plaintext) {
d.logger.Warn("seal health test value failed to decrypt to expected value")
if lastTestOk {
healthCheck.Reset(sealHealthTestIntervalUnhealthy)
}
lastTestOk = false
} else {
d.logger.Debug("seal health test passed")
if !lastTestOk {
d.logger.Info("seal backend is now healthy again", "downtime", t.Sub(lastSeenOk).String())
}
if !lastTestOk {
healthCheck.Reset(sealHealthTestIntervalNominal)
}
lastTestOk = true
lastSeenOk = t
}
d.core.MetricSink().SetGauge(autoSealUnavailableDuration, float32(time.Since(lastSeenOk).Nanoseconds()))
}
}()
}
}
}()
}

func (d *autoSeal) StopHealthCheck() {
d.hcLock.Lock()
defer d.hcLock.Unlock()
if d.healthCheckStop != nil {
close(d.healthCheckStop)
d.healthCheckStop = nil
}
}
70 changes: 70 additions & 0 deletions vault/seal_autoseal_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,13 @@ package vault
import (
"bytes"
"context"
"errors"
"github.com/armon/go-metrics"
"github.com/hashicorp/vault/helper/metricsutil"
"reflect"
"strings"
"testing"
"time"

proto "github.com/golang/protobuf/proto"
wrapping "github.com/hashicorp/go-kms-wrapping"
Expand Down Expand Up @@ -157,3 +162,68 @@ func TestAutoSeal_UpgradeKeys(t *testing.T) {
}
check()
}

func TestAutoSeal_HealthCheck(t *testing.T) {
inmemSink := metrics.NewInmemSink(
1000000*time.Hour,
2000000*time.Hour)

metricsConf := metrics.DefaultConfig("")
metricsConf.EnableHostname = false
metricsConf.EnableHostnameLabel = false
metricsConf.EnableServiceLabel = false
metricsConf.EnableTypePrefix = false

metrics.NewGlobal(metricsConf, inmemSink)

core, _, _ := TestCoreUnsealed(t)
testSeal, testErr := seal.NewToggleableTestSeal(nil)

var encKeys []string
changeKey := func(key string) {
encKeys = append(encKeys, key)
testSeal.Wrapper.(*seal.ToggleableWrapper).Wrapper.(*wrapping.TestWrapper).SetKeyID(key)
}

// Set initial encryption key.
changeKey("kaz")

autoSeal := NewAutoSeal(testSeal)
autoSeal.SetCore(core)
pBackend := newTestBackend(t)
core.physical = pBackend
core.metricSink = metricsutil.NewClusterMetricSink("", inmemSink)

sealHealthTestIntervalNominal = 10 * time.Millisecond
sealHealthTestIntervalUnhealthy = 10 * time.Millisecond
autoSeal.StartHealthCheck()
*testErr = errors.New("disconnected")

time.Sleep(50 * time.Millisecond)

asu := strings.Join(autoSealUnavailableDuration, ".") + ";cluster="
intervals := inmemSink.Data()
if len(intervals) == 1 {
interval := inmemSink.Data()[0]

if _, ok := interval.Gauges[asu]; !ok {
t.Fatalf("Expected metrics to include a value for gauge %s", asu)
}
if interval.Gauges[asu].Value != 0 {
t.Fatalf("Expected value metric %s to be non-zero", asu)
}
}
*testErr = nil
time.Sleep(50 * time.Millisecond)
intervals = inmemSink.Data()
if len(intervals) == 1 {
interval := inmemSink.Data()[0]

if _, ok := interval.Gauges[asu]; !ok {
t.Fatalf("Expected metrics to include a value for gauge %s", asu)
}
if interval.Gauges[asu].Value == 0 {
t.Fatalf("Expected value metric %s to be non-zero", asu)
}
}
}