From 69101faaa8e49aea4d0028156de699497854f3d6 Mon Sep 17 00:00:00 2001 From: Nic Klaassen Date: Mon, 30 Oct 2023 14:54:12 -0700 Subject: [PATCH] Fix cleanup of unused GCP KMS keys This PR fixes the cleanup code for unused keys in GCP KMS which is currently failing to delete unused keys and emitting confusing warning logs whenever keys have been generated by multiple different Auth servers (no actual functionality is currently broken). This bug was introduced in https://github.com/gravitational/teleport/pull/25025. This issue introduced there is that the function now checks that all currently active keys have actually been found in the keyring, but the ListCryptoKeys call used a filter that excluded all keys created by different Auth servers. This fix improves some of the error messages, and also uses a more permissive filter in the ListCryptoKeys call to make sure we can list keys created by any auth server, but will only delete keys created by the local auth server. changelog: Fix cleanup of unused GCP KMS keys --- lib/auth/init.go | 2 +- lib/auth/keystore/gcp_kms.go | 76 +++++++++---- lib/auth/keystore/gcp_kms_test.go | 172 +++++++++++++++++++++++++++-- lib/auth/keystore/keystore_test.go | 63 +++++++---- 4 files changed, 260 insertions(+), 53 deletions(-) diff --git a/lib/auth/init.go b/lib/auth/init.go index e1bb227f909cf..f7d6fb4bbdbec 100644 --- a/lib/auth/init.go +++ b/lib/auth/init.go @@ -561,7 +561,7 @@ func initCluster(ctx context.Context, cfg InitConfig, asrv *Server) error { // Key deletion is best-effort, log a warning if it fails and carry on. // We don't want to prevent a CA rotation, which may be necessary in // some cases where this would fail. - log.WithError(err).Warning("Failed attempt to delete unused HSM keys") + log.Warnf("An attempt to clean up unused HSM or KMS CA keys has failed unexpectedly: %v", err) } if lib.IsInsecureDevMode() { diff --git a/lib/auth/keystore/gcp_kms.go b/lib/auth/keystore/gcp_kms.go index 08485472086ee..8777ac0dee455 100644 --- a/lib/auth/keystore/gcp_kms.go +++ b/lib/auth/keystore/gcp_kms.go @@ -228,65 +228,99 @@ func (g *gcpKMSKeyStore) canSignWithKey(ctx context.Context, raw []byte, keyType return true, nil } -// DeleteUnusedKeys deletes all keys from KMS if they are: -// 1. Labeled by this server (matching HostUUID) when they were created -// 2. Not included in the argument activeKeys +// DeleteUnusedKeys deletes all keys from the configured KMS keyring if they: +// 1. Are not included in the argument activeKeys +// 2. Are labeled with hostLabel (teleport_auth_host) +// 3. The hostLabel value matches the local host UUID +// +// The activeKeys argument is meant to contain to complete set of raw key IDs as +// stored in the current CA specs in the backend. +// +// The reason this does not delete any keys created by a different auth server +// is to avoid a case where: +// 1. A different auth server (auth2) creates a new key in GCP KMS +// 2. This function (running on auth1) deletes that new key +// 3. auth2 saves the id of this deleted key to the backend CA +// +// or a simpler case where: the other auth server is running in a completely +// different Teleport cluster and the keys it's actively using will never appear +// in the activeKeys argument. func (g *gcpKMSKeyStore) DeleteUnusedKeys(ctx context.Context, activeKeys [][]byte) error { // Make a map of currently active key versions, this is used for lookups to - // check which keys in KMS are unused, and holds a count of how many times - // they are found in KMS. If any active keys are not found in KMS, we are in - // a bad state, so key deletion will be aborted. + // check which keys in KMS are unused. activeKmsKeyVersions := make(map[string]int) for _, activeKey := range activeKeys { - keyID, err := parseGCPKMSKeyID(activeKey) + keyIsRelevant, err := g.canSignWithKey(ctx, activeKey, keyType(activeKey)) if err != nil { - // could be a different type of key + // Don't expect this error to ever hit, safer to return if it does. + return trace.Wrap(err) + } + if !keyIsRelevant { + // Ignore active keys that are not GCP KMS keys or are in a + // different keyring than the one this Auth is configured to use. continue } + keyID, err := parseGCPKMSKeyID(activeKey) + if err != nil { + // Realistically we should not hit this since canSignWithKey already + // calls parseGCPKMSKeyID. + return trace.Wrap(err) + } activeKmsKeyVersions[keyID.keyVersionName] = 0 } - var unusedKeyIDs []gcpKMSKeyID + var keysToDelete []gcpKMSKeyID listKeyRequest := &kmspb.ListCryptoKeysRequest{ Parent: g.keyRing, - Filter: fmt.Sprintf("labels.%s=%s", hostLabel, g.hostUUID), + // Only bother listing keys created by Teleport which should have the + // hostLabel set. A filter of "labels.label:*" tests if the label is + // defined. + // https://cloud.google.com/sdk/gcloud/reference/topic/filters + // > Use key:* to test if key is defined + Filter: fmt.Sprintf("labels.%s:*", hostLabel), } iter := g.kmsClient.ListCryptoKeys(ctx, listKeyRequest) key, err := iter.Next() for err == nil { keyVersionName := key.Name + keyVersionSuffix if _, active := activeKmsKeyVersions[keyVersionName]; active { - activeKmsKeyVersions[keyVersionName]++ - } else { - unusedKeyIDs = append(unusedKeyIDs, gcpKMSKeyID{ + // Record that this current active key was actually found. + activeKmsKeyVersions[keyVersionName] += 1 + } else if key.Labels[hostLabel] == g.hostUUID { + // This key is not active (it is not currently stored in any + // Teleport CA) and it was created by this Auth server, so it should + // be safe to delete. + keysToDelete = append(keysToDelete, gcpKMSKeyID{ keyVersionName: keyVersionName, }) } key, err = iter.Next() } if err != nil && !errors.Is(err, iterator.Done) { - return trace.Wrap(err) + return trace.Wrap(err, "unexpected error while iterating GCP KMS keys") } + // If any member of activeKeys which is part of the same GCP KMS keyring + // queried here was not found in the ListCryptoKeys response, something has + // gone wrong and there's a chance we have a bug or GCP has made a breaking + // API change. In this case we should abort to avoid the chance of deleting + // any currently active keys. for keyVersion, found := range activeKmsKeyVersions { if found == 0 { - // Failed to find a currently active key owned by this host. - // The cluster is in a bad state, refuse to delete any keys. return trace.NotFound( - "cannot find currently active CA key in %q GCP KMS, aborting attempt to delete unused keys", + "cannot find currently active CA key %q in GCP KMS, aborting attempt to delete unused keys", keyVersion) } } - for _, unusedKey := range unusedKeyIDs { - g.log.WithFields(logrus.Fields{"key_version": unusedKey.keyVersionName}).Info("deleting unused GCP KMS key created by this server") + for _, unusedKey := range keysToDelete { + g.log.WithFields(logrus.Fields{"key_version": unusedKey.keyVersionName}).Info("Deleting unused GCP KMS key.") err := g.deleteKey(ctx, unusedKey.marshal()) // Ignore errors where we can't destroy because the state is already // DESTROYED or DESTROY_SCHEDULED if err != nil && !strings.Contains(err.Error(), "has value DESTROY") { - g.log.WithFields(logrus.Fields{"key_version": unusedKey.keyVersionName}).WithError(err).Warn("error deleting unused GCP KMS key") - return trace.Wrap(err) + return trace.Wrap(err, "error deleting unused GCP KMS key %q", unusedKey.keyVersionName) } } return nil diff --git a/lib/auth/keystore/gcp_kms_test.go b/lib/auth/keystore/gcp_kms_test.go index 92799e89fb220..4cc002e965a0f 100644 --- a/lib/auth/keystore/gcp_kms_test.go +++ b/lib/auth/keystore/gcp_kms_test.go @@ -31,6 +31,7 @@ import ( kms "cloud.google.com/go/kms/apiv1" "cloud.google.com/go/kms/apiv1/kmspb" "github.com/gravitational/trace" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "golang.org/x/crypto/ssh" "google.golang.org/api/option" @@ -226,15 +227,15 @@ func (f *fakeGCPKMSServer) AsymmetricSign(ctx context.Context, req *kmspb.Asymme return resp, nil } -func (f *fakeGCPKMSServer) ListCryptoKeys(context.Context, *kmspb.ListCryptoKeysRequest) (*kmspb.ListCryptoKeysResponse, error) { +func (f *fakeGCPKMSServer) ListCryptoKeys(ctx context.Context, req *kmspb.ListCryptoKeysRequest) (*kmspb.ListCryptoKeysResponse, error) { f.mu.RLock() defer f.mu.RUnlock() var cryptoKeys []*kmspb.CryptoKey - for keyVersionName := range f.keyVersions { - cryptoKey := &kmspb.CryptoKey{ - Name: strings.TrimSuffix(keyVersionName, "/cryptoKeyVersions/1"), + for keyVersionName, keyState := range f.keyVersions { + if !strings.HasPrefix(keyVersionName, req.Parent) { + continue } - cryptoKeys = append(cryptoKeys, cryptoKey) + cryptoKeys = append(cryptoKeys, keyState.cryptoKey) } resp := &kmspb.ListCryptoKeysResponse{ CryptoKeys: cryptoKeys, @@ -252,12 +253,7 @@ func (f *fakeGCPKMSServer) DestroyCryptoKeyVersion(ctx context.Context, req *kms } keyState.cryptoKeyVersion.State = kmspb.CryptoKeyVersion_DESTROY_SCHEDULED - - resp := &kmspb.CryptoKeyVersion{ - Name: req.Name, - State: keyState.cryptoKeyVersion.State, - } - return resp, nil + return keyState.cryptoKeyVersion, nil } // deleteKey is a test helper to delete a key by the raw ID which would be @@ -593,3 +589,157 @@ func TestGCPKMSKeystore(t *testing.T) { }) } } + +func TestGCPKMSDeleteUnusedKeys(t *testing.T) { + t.Parallel() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + const ( + localHostID = "local-host-id" + otherHostID = "other-host-id" + localKeyring = "local-keyring" + otherKeyring = "other-keyring" + ) + + for _, tc := range []struct { + desc string + existingKeys []keySpec + activeKeys []keySpec + expectDestroyed []keySpec + }{ + { + // Only inactive keys should be destroyed. + desc: "active and inactive", + existingKeys: []keySpec{ + {keyring: localKeyring, id: "id_active", host: localHostID}, + {keyring: localKeyring, id: "id_inactive", host: localHostID}, + }, + activeKeys: []keySpec{ + {keyring: localKeyring, id: "id_active", host: localHostID}, + }, + expectDestroyed: []keySpec{ + {keyring: localKeyring, id: "id_inactive", host: localHostID}, + }, + }, + { + // Inactive key from other host should not be destroyed, it may + // be recently created and just not added to Teleport CA yet, or the + // other Auth might be in a completely different Teleport cluster + // using the same keyring (I wouldn't advise this but someone might + // do it). + desc: "inactive key from other host", + existingKeys: []keySpec{ + {keyring: localKeyring, id: "id_inactive_local", host: localHostID}, + {keyring: localKeyring, id: "id_inactive_remote", host: otherHostID}, + }, + expectDestroyed: []keySpec{ + {keyring: localKeyring, id: "id_inactive_local", host: localHostID}, + }, + }, + { + // The presence of active keys created by a remote host in the local + // keyring should not break the DeleteUnusedKeys operation. + desc: "active key from other host", + existingKeys: []keySpec{ + {keyring: localKeyring, id: "id_active_local", host: localHostID}, + {keyring: localKeyring, id: "id_inactive_local", host: localHostID}, + {keyring: localKeyring, id: "id_active_remote", host: otherHostID}, + {keyring: localKeyring, id: "id_inactive_remote", host: otherHostID}, + }, + activeKeys: []keySpec{ + {keyring: localKeyring, id: "id_active_local", host: localHostID}, + {keyring: localKeyring, id: "id_active_remote", host: otherHostID}, + }, + expectDestroyed: []keySpec{ + {keyring: localKeyring, id: "id_inactive_local", host: localHostID}, + }, + }, + { + // Keys in other keyring should never be destroyed. + desc: "keys in other keyring", + existingKeys: []keySpec{ + {keyring: localKeyring, id: "id_inactive_local", host: localHostID}, + {keyring: otherKeyring, id: "id_inactive_other1", host: localHostID}, + {keyring: otherKeyring, id: "id_inactive_other2", host: otherHostID}, + }, + expectDestroyed: []keySpec{ + {keyring: localKeyring, id: "id_inactive_local", host: localHostID}, + }, + }, + } { + t.Run(tc.desc, func(t *testing.T) { + fakeKMSServer, dialer := newTestGCPKMSService(t) + kmsClient := newTestGCPKMSClient(t, dialer) + manager, err := NewManager(ctx, Config{ + GCPKMS: GCPKMSConfig{ + ProtectionLevel: "HSM", + HostUUID: localHostID, + KeyRing: localKeyring, + kmsClientOverride: kmsClient, + }, + }) + require.NoError(t, err, "error while creating test keystore manager") + + // Pre-req: create existing keys in fake KMS backend + for _, ks := range tc.existingKeys { + _, err := fakeKMSServer.CreateCryptoKey(ctx, createKeyRequest(ks)) + require.NoError(t, err) + } + + // Test: DeleteUnusedKeys with activeKeys from the testcase + activeKeyIDs := make([][]byte, len(tc.activeKeys)) + for i, ks := range tc.activeKeys { + activeKeyIDs[i] = ks.keyID() + } + err = manager.DeleteUnusedKeys(ctx, activeKeyIDs) + assert.NoError(t, err) + + expectDestroyedSet := make(map[string]bool, len(tc.expectDestroyed)) + for _, ks := range tc.expectDestroyed { + expectDestroyedSet[ks.keyVersionName()] = true + } + require.Len(t, fakeKMSServer.keyVersions, len(tc.existingKeys)) + for keyVersionName, state := range fakeKMSServer.keyVersions { + if expectDestroyedSet[keyVersionName] { + // Fake KMS server only sets state to DESTROY_SCHEDULED, + // that's good enough for the test. + require.Equal(t, kmspb.CryptoKeyVersion_DESTROY_SCHEDULED.String(), state.cryptoKeyVersion.State.String()) + } else { + require.Equal(t, kmspb.CryptoKeyVersion_ENABLED.String(), state.cryptoKeyVersion.State.String()) + } + } + }) + } +} + +type keySpec struct { + keyring, id, host string +} + +func (ks *keySpec) keyVersionName() string { + return ks.keyring + "/cryptoKeys/" + ks.id + keyVersionSuffix +} + +func (ks *keySpec) keyID() []byte { + return gcpKMSKeyID{ + keyVersionName: ks.keyVersionName(), + }.marshal() +} + +func createKeyRequest(ks keySpec) *kmspb.CreateCryptoKeyRequest { + return &kmspb.CreateCryptoKeyRequest{ + Parent: ks.keyring, + CryptoKeyId: ks.id, + CryptoKey: &kmspb.CryptoKey{ + Purpose: kmspb.CryptoKey_ASYMMETRIC_SIGN, + Labels: map[string]string{ + hostLabel: ks.host, + }, + VersionTemplate: &kmspb.CryptoKeyVersionTemplate{ + ProtectionLevel: kmspb.ProtectionLevel_SOFTWARE, + Algorithm: kmspb.CryptoKeyVersion_RSA_SIGN_PKCS1_2048_SHA256, + }, + }, + } +} diff --git a/lib/auth/keystore/keystore_test.go b/lib/auth/keystore/keystore_test.go index 6256511abc954..d99caebc9e2d6 100644 --- a/lib/auth/keystore/keystore_test.go +++ b/lib/auth/keystore/keystore_test.go @@ -163,11 +163,13 @@ func TestKeyStore(t *testing.T) { yubiSlotNumber := 0 backends := []struct { - desc string - config Config - isSoftware bool - shouldSkip func() bool - fakeKeyHack func([]byte) []byte + desc string + config Config + isSoftware bool + shouldSkip func() bool + // unusedRawKey should return passable raw key identifier for this + // backend that would not actually exist in the backend. + unusedRawKey func(t *testing.T) []byte }{ { desc: "software", @@ -178,6 +180,11 @@ func TestKeyStore(t *testing.T) { }, isSoftware: true, shouldSkip: func() bool { return false }, + unusedRawKey: func(t *testing.T) []byte { + rawKey, _, err := native.GenerateKeyPair() + require.NoError(t, err) + return rawKey + }, }, { desc: "softhsm", @@ -189,6 +196,14 @@ func TestKeyStore(t *testing.T) { } return false }, + unusedRawKey: func(t *testing.T) []byte { + rawKey, err := keyID{ + HostID: softHSMConfig.PKCS11.HostUUID, + KeyID: "FFFFFFFF-FFFF-FFFF-FFFF-FFFFFFFFFFFF", + }.marshal() + require.NoError(t, err) + return rawKey + }, }, { desc: "yubihsm", @@ -207,6 +222,14 @@ func TestKeyStore(t *testing.T) { } return false }, + unusedRawKey: func(t *testing.T) []byte { + rawKey, err := keyID{ + HostID: hostUUID, + KeyID: "FFFFFFFF-FFFF-FFFF-FFFF-FFFFFFFFFFFF", + }.marshal() + require.NoError(t, err) + return rawKey + }, }, { desc: "cloudhsm", @@ -225,6 +248,14 @@ func TestKeyStore(t *testing.T) { } return false }, + unusedRawKey: func(t *testing.T) []byte { + rawKey, err := keyID{ + HostID: hostUUID, + KeyID: "FFFFFFFF-FFFF-FFFF-FFFF-FFFFFFFFFFFF", + }.marshal() + require.NoError(t, err) + return rawKey + }, }, { desc: "gcp kms", @@ -234,14 +265,10 @@ func TestKeyStore(t *testing.T) { shouldSkip: func() bool { return false }, - fakeKeyHack: func(key []byte) []byte { - // GCP KMS keys are never really deleted, their state is just - // set to destroyed, so this hack modifies a key to make it - // unrecognizable - kmsKey, err := parseGCPKMSKeyID(key) - require.NoError(t, err) - kmsKey.keyVersionName += "fake" - return kmsKey.marshal() + unusedRawKey: func(t *testing.T) []byte { + return gcpKMSKeyID{ + keyVersionName: gcpKMSConfig.KeyRing + "/cryptoKeys/FFFFFFFF-FFFF-FFFF-FFFF-FFFFFFFFFFFF" + keyVersionSuffix, + }.marshal() }, }, } @@ -431,13 +458,9 @@ func TestKeyStore(t *testing.T) { } // Make sure key deletion is aborted when one of the active keys - // cannot be found. - // Use rawKeys[1] as a fake active key, it was just deleted in the - // previous step. - fakeActiveKey := rawKeys[1] - if tc.fakeKeyHack != nil { - fakeActiveKey = tc.fakeKeyHack(fakeActiveKey) - } + // cannot be found. This makes sure that we don't accidentally + // delete current active keys in case the ListKeys operation fails. + fakeActiveKey := tc.unusedRawKey(t) err = keyStore.DeleteUnusedKeys(ctx, [][]byte{fakeActiveKey}) require.True(t, trace.IsNotFound(err), "expected NotFound error, got %v", err)