-
Notifications
You must be signed in to change notification settings - Fork 4.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add a periodic test of the autoseal to detect loss of connectivity. #13078
Changes from 5 commits
0c77bba
f07a457
99f21d2
260a539
b8121d9
f7c4a2d
6fd85fb
10517d1
4899719
7f5e2bb
674c5f5
29ee1a9
7ac2cfa
649dd0a
5e9fa94
4e6b5a3
bb6fa2e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
```release-note:improvement | ||
core: Periodically test the health of connectivity to auto-seal backends | ||
``` |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,11 +1,14 @@ | ||
package vault | ||
|
||
import ( | ||
"bytes" | ||
"context" | ||
"crypto/subtle" | ||
"encoding/json" | ||
"fmt" | ||
mathrand "math/rand" | ||
"sync/atomic" | ||
"time" | ||
|
||
proto "github.com/golang/protobuf/proto" | ||
log "github.com/hashicorp/go-hclog" | ||
|
@@ -18,16 +21,20 @@ import ( | |
// applicable in the OSS side | ||
var barrierTypeUpgradeCheck = func(_ string, _ *SealConfig) {} | ||
|
||
const sealHeathTestInterval = 1 * time.Minute | ||
|
||
// autoSeal is a Seal implementation that contains logic for encrypting and | ||
// decrypting stored keys via an underlying AutoSealAccess implementation, as | ||
// well as logic related to recovery keys and barrier config. | ||
type autoSeal struct { | ||
*seal.Access | ||
|
||
barrierConfig atomic.Value | ||
recoveryConfig atomic.Value | ||
core *Core | ||
logger log.Logger | ||
barrierConfig atomic.Value | ||
recoveryConfig atomic.Value | ||
core *Core | ||
logger log.Logger | ||
healthCheck *time.Ticker | ||
ncabatoff marked this conversation as resolved.
Show resolved
Hide resolved
|
||
healthCheckStop chan struct{} | ||
} | ||
|
||
// Ensure we are implementing the Seal interface | ||
|
@@ -499,3 +506,52 @@ func (d *autoSeal) migrateRecoveryConfig(ctx context.Context) error { | |
|
||
return nil | ||
} | ||
|
||
func (d *autoSeal) StartHealthCheck() { | ||
d.healthCheck = time.NewTicker(sealHeathTestInterval) | ||
d.healthCheckStop = make(chan struct{}) | ||
ncabatoff marked this conversation as resolved.
Show resolved
Hide resolved
|
||
go func() { | ||
lastTestOk := true | ||
ncabatoff marked this conversation as resolved.
Show resolved
Hide resolved
|
||
lastSeenOk := time.Now() | ||
for { | ||
select { | ||
case <-d.healthCheckStop: | ||
d.healthCheck.Stop() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm probably being paranoid here but if we get multiple healthCheckStop we will get a nil deference error. Is it worth adding a quick test and return if d.healthCheck is nil? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nvm the case statement will never fire. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yes it is, thanks. |
||
close(d.healthCheckStop) | ||
d.healthCheck = nil | ||
d.healthCheckStop = nil | ||
return | ||
case t := <-d.healthCheck.C: | ||
testVal := fmt.Sprintf("Heartbeat %d", mathrand.Intn(1000)) | ||
ciphertext, err := d.Wrapper.Encrypt(d.core.activeContext, []byte(testVal), nil) | ||
ncabatoff marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if err != nil { | ||
lastTestOk = false | ||
d.logger.Warn("failed to encrypt seal health test value, seal backend may be unreachable", "error", err) | ||
} else { | ||
plaintext, err := d.Wrapper.Decrypt(d.core.activeContext, ciphertext, nil) | ||
if err != nil { | ||
lastTestOk = false | ||
d.logger.Warn("failed to decrypt seal health test value, seal backend may be unreachable", "error", err) | ||
} | ||
if !bytes.Equal([]byte(testVal), plaintext) { | ||
lastTestOk = false | ||
d.logger.Warn("seal health test value failed to decrypt to expected value") | ||
} else { | ||
d.logger.Debug("seal health test passed") | ||
if !lastTestOk { | ||
d.logger.Info("seal backend is now healthy again", "downtime", t.Sub(lastSeenOk).String()) | ||
} | ||
lastTestOk = true | ||
lastSeenOk = t | ||
} | ||
} | ||
} | ||
} | ||
}() | ||
} | ||
|
||
func (d *autoSeal) StopHealthCheck() { | ||
if d.healthCheckStop != nil { | ||
d.healthCheckStop <- struct{}{} | ||
ncabatoff marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Didn't we want to check this hourly?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, I'm back and forth on that. Since it's a cheap encrypt/decrypt I coded it to be more frequent. Open to thoughts.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm assuming this is being done on all auto-unseal implementations, not just pkcs11 implementations. If that is the case could the costs associated with using the various KMS providers within the cloud start adding up? I don't have a good sense of those costs across the various implementations.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
sealHeathTestInterval
- should this be sealHealthTestInterval?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It sure should.