diff --git a/lib/auth/init.go b/lib/auth/init.go index 67b549f52ef71..27090c2141c8d 100644 --- a/lib/auth/init.go +++ b/lib/auth/init.go @@ -571,7 +571,7 @@ func initCluster(ctx context.Context, cfg InitConfig, asrv *Server) error { // Key deletion is best-effort, log a warning if it fails and carry on. // We don't want to prevent a CA rotation, which may be necessary in // some cases where this would fail. - log.WithError(err).Warning("Failed attempt to delete unused HSM keys") + log.Warnf("An attempt to clean up unused HSM or KMS CA keys has failed unexpectedly: %v", err) } if lib.IsInsecureDevMode() { diff --git a/lib/auth/keystore/gcp_kms.go b/lib/auth/keystore/gcp_kms.go index 08485472086ee..8777ac0dee455 100644 --- a/lib/auth/keystore/gcp_kms.go +++ b/lib/auth/keystore/gcp_kms.go @@ -228,65 +228,99 @@ func (g *gcpKMSKeyStore) canSignWithKey(ctx context.Context, raw []byte, keyType return true, nil } -// DeleteUnusedKeys deletes all keys from KMS if they are: -// 1. Labeled by this server (matching HostUUID) when they were created -// 2. Not included in the argument activeKeys +// DeleteUnusedKeys deletes all keys from the configured KMS keyring if they: +// 1. Are not included in the argument activeKeys +// 2. Are labeled with hostLabel (teleport_auth_host) +// 3. The hostLabel value matches the local host UUID +// +// The activeKeys argument is meant to contain to complete set of raw key IDs as +// stored in the current CA specs in the backend. +// +// The reason this does not delete any keys created by a different auth server +// is to avoid a case where: +// 1. A different auth server (auth2) creates a new key in GCP KMS +// 2. This function (running on auth1) deletes that new key +// 3. auth2 saves the id of this deleted key to the backend CA +// +// or a simpler case where: the other auth server is running in a completely +// different Teleport cluster and the keys it's actively using will never appear +// in the activeKeys argument. func (g *gcpKMSKeyStore) DeleteUnusedKeys(ctx context.Context, activeKeys [][]byte) error { // Make a map of currently active key versions, this is used for lookups to - // check which keys in KMS are unused, and holds a count of how many times - // they are found in KMS. If any active keys are not found in KMS, we are in - // a bad state, so key deletion will be aborted. + // check which keys in KMS are unused. activeKmsKeyVersions := make(map[string]int) for _, activeKey := range activeKeys { - keyID, err := parseGCPKMSKeyID(activeKey) + keyIsRelevant, err := g.canSignWithKey(ctx, activeKey, keyType(activeKey)) if err != nil { - // could be a different type of key + // Don't expect this error to ever hit, safer to return if it does. + return trace.Wrap(err) + } + if !keyIsRelevant { + // Ignore active keys that are not GCP KMS keys or are in a + // different keyring than the one this Auth is configured to use. continue } + keyID, err := parseGCPKMSKeyID(activeKey) + if err != nil { + // Realistically we should not hit this since canSignWithKey already + // calls parseGCPKMSKeyID. + return trace.Wrap(err) + } activeKmsKeyVersions[keyID.keyVersionName] = 0 } - var unusedKeyIDs []gcpKMSKeyID + var keysToDelete []gcpKMSKeyID listKeyRequest := &kmspb.ListCryptoKeysRequest{ Parent: g.keyRing, - Filter: fmt.Sprintf("labels.%s=%s", hostLabel, g.hostUUID), + // Only bother listing keys created by Teleport which should have the + // hostLabel set. A filter of "labels.label:*" tests if the label is + // defined. + // https://cloud.google.com/sdk/gcloud/reference/topic/filters + // > Use key:* to test if key is defined + Filter: fmt.Sprintf("labels.%s:*", hostLabel), } iter := g.kmsClient.ListCryptoKeys(ctx, listKeyRequest) key, err := iter.Next() for err == nil { keyVersionName := key.Name + keyVersionSuffix if _, active := activeKmsKeyVersions[keyVersionName]; active { - activeKmsKeyVersions[keyVersionName]++ - } else { - unusedKeyIDs = append(unusedKeyIDs, gcpKMSKeyID{ + // Record that this current active key was actually found. + activeKmsKeyVersions[keyVersionName] += 1 + } else if key.Labels[hostLabel] == g.hostUUID { + // This key is not active (it is not currently stored in any + // Teleport CA) and it was created by this Auth server, so it should + // be safe to delete. + keysToDelete = append(keysToDelete, gcpKMSKeyID{ keyVersionName: keyVersionName, }) } key, err = iter.Next() } if err != nil && !errors.Is(err, iterator.Done) { - return trace.Wrap(err) + return trace.Wrap(err, "unexpected error while iterating GCP KMS keys") } + // If any member of activeKeys which is part of the same GCP KMS keyring + // queried here was not found in the ListCryptoKeys response, something has + // gone wrong and there's a chance we have a bug or GCP has made a breaking + // API change. In this case we should abort to avoid the chance of deleting + // any currently active keys. for keyVersion, found := range activeKmsKeyVersions { if found == 0 { - // Failed to find a currently active key owned by this host. - // The cluster is in a bad state, refuse to delete any keys. return trace.NotFound( - "cannot find currently active CA key in %q GCP KMS, aborting attempt to delete unused keys", + "cannot find currently active CA key %q in GCP KMS, aborting attempt to delete unused keys", keyVersion) } } - for _, unusedKey := range unusedKeyIDs { - g.log.WithFields(logrus.Fields{"key_version": unusedKey.keyVersionName}).Info("deleting unused GCP KMS key created by this server") + for _, unusedKey := range keysToDelete { + g.log.WithFields(logrus.Fields{"key_version": unusedKey.keyVersionName}).Info("Deleting unused GCP KMS key.") err := g.deleteKey(ctx, unusedKey.marshal()) // Ignore errors where we can't destroy because the state is already // DESTROYED or DESTROY_SCHEDULED if err != nil && !strings.Contains(err.Error(), "has value DESTROY") { - g.log.WithFields(logrus.Fields{"key_version": unusedKey.keyVersionName}).WithError(err).Warn("error deleting unused GCP KMS key") - return trace.Wrap(err) + return trace.Wrap(err, "error deleting unused GCP KMS key %q", unusedKey.keyVersionName) } } return nil diff --git a/lib/auth/keystore/gcp_kms_test.go b/lib/auth/keystore/gcp_kms_test.go index 92799e89fb220..4cc002e965a0f 100644 --- a/lib/auth/keystore/gcp_kms_test.go +++ b/lib/auth/keystore/gcp_kms_test.go @@ -31,6 +31,7 @@ import ( kms "cloud.google.com/go/kms/apiv1" "cloud.google.com/go/kms/apiv1/kmspb" "github.com/gravitational/trace" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "golang.org/x/crypto/ssh" "google.golang.org/api/option" @@ -226,15 +227,15 @@ func (f *fakeGCPKMSServer) AsymmetricSign(ctx context.Context, req *kmspb.Asymme return resp, nil } -func (f *fakeGCPKMSServer) ListCryptoKeys(context.Context, *kmspb.ListCryptoKeysRequest) (*kmspb.ListCryptoKeysResponse, error) { +func (f *fakeGCPKMSServer) ListCryptoKeys(ctx context.Context, req *kmspb.ListCryptoKeysRequest) (*kmspb.ListCryptoKeysResponse, error) { f.mu.RLock() defer f.mu.RUnlock() var cryptoKeys []*kmspb.CryptoKey - for keyVersionName := range f.keyVersions { - cryptoKey := &kmspb.CryptoKey{ - Name: strings.TrimSuffix(keyVersionName, "/cryptoKeyVersions/1"), + for keyVersionName, keyState := range f.keyVersions { + if !strings.HasPrefix(keyVersionName, req.Parent) { + continue } - cryptoKeys = append(cryptoKeys, cryptoKey) + cryptoKeys = append(cryptoKeys, keyState.cryptoKey) } resp := &kmspb.ListCryptoKeysResponse{ CryptoKeys: cryptoKeys, @@ -252,12 +253,7 @@ func (f *fakeGCPKMSServer) DestroyCryptoKeyVersion(ctx context.Context, req *kms } keyState.cryptoKeyVersion.State = kmspb.CryptoKeyVersion_DESTROY_SCHEDULED - - resp := &kmspb.CryptoKeyVersion{ - Name: req.Name, - State: keyState.cryptoKeyVersion.State, - } - return resp, nil + return keyState.cryptoKeyVersion, nil } // deleteKey is a test helper to delete a key by the raw ID which would be @@ -593,3 +589,157 @@ func TestGCPKMSKeystore(t *testing.T) { }) } } + +func TestGCPKMSDeleteUnusedKeys(t *testing.T) { + t.Parallel() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + const ( + localHostID = "local-host-id" + otherHostID = "other-host-id" + localKeyring = "local-keyring" + otherKeyring = "other-keyring" + ) + + for _, tc := range []struct { + desc string + existingKeys []keySpec + activeKeys []keySpec + expectDestroyed []keySpec + }{ + { + // Only inactive keys should be destroyed. + desc: "active and inactive", + existingKeys: []keySpec{ + {keyring: localKeyring, id: "id_active", host: localHostID}, + {keyring: localKeyring, id: "id_inactive", host: localHostID}, + }, + activeKeys: []keySpec{ + {keyring: localKeyring, id: "id_active", host: localHostID}, + }, + expectDestroyed: []keySpec{ + {keyring: localKeyring, id: "id_inactive", host: localHostID}, + }, + }, + { + // Inactive key from other host should not be destroyed, it may + // be recently created and just not added to Teleport CA yet, or the + // other Auth might be in a completely different Teleport cluster + // using the same keyring (I wouldn't advise this but someone might + // do it). + desc: "inactive key from other host", + existingKeys: []keySpec{ + {keyring: localKeyring, id: "id_inactive_local", host: localHostID}, + {keyring: localKeyring, id: "id_inactive_remote", host: otherHostID}, + }, + expectDestroyed: []keySpec{ + {keyring: localKeyring, id: "id_inactive_local", host: localHostID}, + }, + }, + { + // The presence of active keys created by a remote host in the local + // keyring should not break the DeleteUnusedKeys operation. + desc: "active key from other host", + existingKeys: []keySpec{ + {keyring: localKeyring, id: "id_active_local", host: localHostID}, + {keyring: localKeyring, id: "id_inactive_local", host: localHostID}, + {keyring: localKeyring, id: "id_active_remote", host: otherHostID}, + {keyring: localKeyring, id: "id_inactive_remote", host: otherHostID}, + }, + activeKeys: []keySpec{ + {keyring: localKeyring, id: "id_active_local", host: localHostID}, + {keyring: localKeyring, id: "id_active_remote", host: otherHostID}, + }, + expectDestroyed: []keySpec{ + {keyring: localKeyring, id: "id_inactive_local", host: localHostID}, + }, + }, + { + // Keys in other keyring should never be destroyed. + desc: "keys in other keyring", + existingKeys: []keySpec{ + {keyring: localKeyring, id: "id_inactive_local", host: localHostID}, + {keyring: otherKeyring, id: "id_inactive_other1", host: localHostID}, + {keyring: otherKeyring, id: "id_inactive_other2", host: otherHostID}, + }, + expectDestroyed: []keySpec{ + {keyring: localKeyring, id: "id_inactive_local", host: localHostID}, + }, + }, + } { + t.Run(tc.desc, func(t *testing.T) { + fakeKMSServer, dialer := newTestGCPKMSService(t) + kmsClient := newTestGCPKMSClient(t, dialer) + manager, err := NewManager(ctx, Config{ + GCPKMS: GCPKMSConfig{ + ProtectionLevel: "HSM", + HostUUID: localHostID, + KeyRing: localKeyring, + kmsClientOverride: kmsClient, + }, + }) + require.NoError(t, err, "error while creating test keystore manager") + + // Pre-req: create existing keys in fake KMS backend + for _, ks := range tc.existingKeys { + _, err := fakeKMSServer.CreateCryptoKey(ctx, createKeyRequest(ks)) + require.NoError(t, err) + } + + // Test: DeleteUnusedKeys with activeKeys from the testcase + activeKeyIDs := make([][]byte, len(tc.activeKeys)) + for i, ks := range tc.activeKeys { + activeKeyIDs[i] = ks.keyID() + } + err = manager.DeleteUnusedKeys(ctx, activeKeyIDs) + assert.NoError(t, err) + + expectDestroyedSet := make(map[string]bool, len(tc.expectDestroyed)) + for _, ks := range tc.expectDestroyed { + expectDestroyedSet[ks.keyVersionName()] = true + } + require.Len(t, fakeKMSServer.keyVersions, len(tc.existingKeys)) + for keyVersionName, state := range fakeKMSServer.keyVersions { + if expectDestroyedSet[keyVersionName] { + // Fake KMS server only sets state to DESTROY_SCHEDULED, + // that's good enough for the test. + require.Equal(t, kmspb.CryptoKeyVersion_DESTROY_SCHEDULED.String(), state.cryptoKeyVersion.State.String()) + } else { + require.Equal(t, kmspb.CryptoKeyVersion_ENABLED.String(), state.cryptoKeyVersion.State.String()) + } + } + }) + } +} + +type keySpec struct { + keyring, id, host string +} + +func (ks *keySpec) keyVersionName() string { + return ks.keyring + "/cryptoKeys/" + ks.id + keyVersionSuffix +} + +func (ks *keySpec) keyID() []byte { + return gcpKMSKeyID{ + keyVersionName: ks.keyVersionName(), + }.marshal() +} + +func createKeyRequest(ks keySpec) *kmspb.CreateCryptoKeyRequest { + return &kmspb.CreateCryptoKeyRequest{ + Parent: ks.keyring, + CryptoKeyId: ks.id, + CryptoKey: &kmspb.CryptoKey{ + Purpose: kmspb.CryptoKey_ASYMMETRIC_SIGN, + Labels: map[string]string{ + hostLabel: ks.host, + }, + VersionTemplate: &kmspb.CryptoKeyVersionTemplate{ + ProtectionLevel: kmspb.ProtectionLevel_SOFTWARE, + Algorithm: kmspb.CryptoKeyVersion_RSA_SIGN_PKCS1_2048_SHA256, + }, + }, + } +} diff --git a/lib/auth/keystore/keystore_test.go b/lib/auth/keystore/keystore_test.go index 6256511abc954..d99caebc9e2d6 100644 --- a/lib/auth/keystore/keystore_test.go +++ b/lib/auth/keystore/keystore_test.go @@ -163,11 +163,13 @@ func TestKeyStore(t *testing.T) { yubiSlotNumber := 0 backends := []struct { - desc string - config Config - isSoftware bool - shouldSkip func() bool - fakeKeyHack func([]byte) []byte + desc string + config Config + isSoftware bool + shouldSkip func() bool + // unusedRawKey should return passable raw key identifier for this + // backend that would not actually exist in the backend. + unusedRawKey func(t *testing.T) []byte }{ { desc: "software", @@ -178,6 +180,11 @@ func TestKeyStore(t *testing.T) { }, isSoftware: true, shouldSkip: func() bool { return false }, + unusedRawKey: func(t *testing.T) []byte { + rawKey, _, err := native.GenerateKeyPair() + require.NoError(t, err) + return rawKey + }, }, { desc: "softhsm", @@ -189,6 +196,14 @@ func TestKeyStore(t *testing.T) { } return false }, + unusedRawKey: func(t *testing.T) []byte { + rawKey, err := keyID{ + HostID: softHSMConfig.PKCS11.HostUUID, + KeyID: "FFFFFFFF-FFFF-FFFF-FFFF-FFFFFFFFFFFF", + }.marshal() + require.NoError(t, err) + return rawKey + }, }, { desc: "yubihsm", @@ -207,6 +222,14 @@ func TestKeyStore(t *testing.T) { } return false }, + unusedRawKey: func(t *testing.T) []byte { + rawKey, err := keyID{ + HostID: hostUUID, + KeyID: "FFFFFFFF-FFFF-FFFF-FFFF-FFFFFFFFFFFF", + }.marshal() + require.NoError(t, err) + return rawKey + }, }, { desc: "cloudhsm", @@ -225,6 +248,14 @@ func TestKeyStore(t *testing.T) { } return false }, + unusedRawKey: func(t *testing.T) []byte { + rawKey, err := keyID{ + HostID: hostUUID, + KeyID: "FFFFFFFF-FFFF-FFFF-FFFF-FFFFFFFFFFFF", + }.marshal() + require.NoError(t, err) + return rawKey + }, }, { desc: "gcp kms", @@ -234,14 +265,10 @@ func TestKeyStore(t *testing.T) { shouldSkip: func() bool { return false }, - fakeKeyHack: func(key []byte) []byte { - // GCP KMS keys are never really deleted, their state is just - // set to destroyed, so this hack modifies a key to make it - // unrecognizable - kmsKey, err := parseGCPKMSKeyID(key) - require.NoError(t, err) - kmsKey.keyVersionName += "fake" - return kmsKey.marshal() + unusedRawKey: func(t *testing.T) []byte { + return gcpKMSKeyID{ + keyVersionName: gcpKMSConfig.KeyRing + "/cryptoKeys/FFFFFFFF-FFFF-FFFF-FFFF-FFFFFFFFFFFF" + keyVersionSuffix, + }.marshal() }, }, } @@ -431,13 +458,9 @@ func TestKeyStore(t *testing.T) { } // Make sure key deletion is aborted when one of the active keys - // cannot be found. - // Use rawKeys[1] as a fake active key, it was just deleted in the - // previous step. - fakeActiveKey := rawKeys[1] - if tc.fakeKeyHack != nil { - fakeActiveKey = tc.fakeKeyHack(fakeActiveKey) - } + // cannot be found. This makes sure that we don't accidentally + // delete current active keys in case the ListKeys operation fails. + fakeActiveKey := tc.unusedRawKey(t) err = keyStore.DeleteUnusedKeys(ctx, [][]byte{fakeActiveKey}) require.True(t, trace.IsNotFound(err), "expected NotFound error, got %v", err)