diff --git a/integrations/kube-agent-updater/DEBUG.md b/integrations/kube-agent-updater/DEBUG.md new file mode 100644 index 0000000000000..2156f080f353c --- /dev/null +++ b/integrations/kube-agent-updater/DEBUG.md @@ -0,0 +1,26 @@ +## Debugging tips for the kube-agent-updater + +### Running the updater locally against a remote Kubernetes cluster + +Running locally lets you attach a debugger while still working against a real +cluster. This can be used to reproduce most complex issues and troubleshoot +specific cases. + +- Validate your current context works + ```shell + kubectl cluster-info + ``` +- Open a proxy to the api-server, then leave the shell open and running + ```shell + kubectl proxy + ``` +- open a new terminal, create a new temporary file and create your new kubeconfig + ```shell + export KUBECONFIG="$(mktemp)" + kubectl config set-credentials myself --username=foo + kubectl config set-cluster local-server --server=http://localhost:8001 + kubectl config set-context default-context --cluster=local-server --user=myself + kubectl config use-context default-context + echo "$KUBECONFIG" + ``` +- run the controller with the `KUBECONFIG` environment variable set diff --git a/integrations/kube-agent-updater/cmd/teleport-kube-agent-updater/constants.go b/integrations/kube-agent-updater/cmd/teleport-kube-agent-updater/constants.go new file mode 100644 index 0000000000000..3015b7150ae0f --- /dev/null +++ b/integrations/kube-agent-updater/cmd/teleport-kube-agent-updater/constants.go @@ -0,0 +1,36 @@ +/* +Copyright 2023 Gravitational, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +// teleportProdOCIPubKey is the key used to sign Teleport distroless images. +// The key lives in the Teleport production AWS KMS. +// In case of controlled rotation, we will want to add a second validator with +// the new key to support the transition period. +var teleportProdOCIPubKey = []byte(`-----BEGIN PUBLIC KEY----- +MIICIjANBgkqhkiG9w0BAQEFAAOCAg8AMIICCgKCAgEAx+9UZboMl9ibwu/IWqbX ++wEJeKJqVpaLEsy1ODRpzIgcgaMh2n3BWtFEIoEszR3ZNlGdfqoPmb0nNnWx/qSf +eEsoSXievXa63M/gAUBB+jecbGEJH+SNaJPMVuvjabPqKtoMT2Spw3cacqpINzq1 +rkWU8IawY333gXbwzgsuK7izT7ymgOLPO9qPuX7Q3EBaGw3EvY7u6UKtqhvSGdyr +MirEErOERQ8EP8TrkCcJk0UfPAukzIcj91uHlXaqYBD/IyNYiC70EOlSLoN5/EeA +I4jQnGRfaKF6H6K+WieX9tP9k8/02S+1EVJW592pdQZhJZEq1B/dMc8UR3IjPMMC +qCT2xT6TsinaVzDaAbaRf0hvp311GxwrckNofGm/OSLn1+HqM6q4/A7qHubeRXGO +byabRr93CHSLegZ7OBMswHqqnu6/DuXjc6gOsQkH09dVTFeh34rQy4GKrvnpmOwj +Er1ccxzKcF/pw+lxi07hkpihR/uHUPxFboA/Wl7H2Jub21MFwIFQrDJv7z8yQgxJ +EuIXJJox2oAL7NzdSi9VIUYnEnx+2EtkU/spAFRR6i1BnT6aoIy3521B76wnmRr9 +atCSKjt6MdRxgj4htCjBWWJAGM9Z/avF4CYFmK7qiVxgpdrSM8Esbt2Ta+Lu3QMJ +T8LjqFu3u3dxVOo9RuLk+BkCAwEAAQ== +-----END PUBLIC KEY-----`) diff --git a/integrations/kube-agent-updater/cmd/teleport-kube-agent-updater/main.go b/integrations/kube-agent-updater/cmd/teleport-kube-agent-updater/main.go index 8af57348ad29c..bb312a8c62f18 100644 --- a/integrations/kube-agent-updater/cmd/teleport-kube-agent-updater/main.go +++ b/integrations/kube-agent-updater/cmd/teleport-kube-agent-updater/main.go @@ -18,19 +18,21 @@ package main import ( "flag" + "net/url" "os" + "strings" "time" 
"github.com/docker/distribution/reference" "github.com/gravitational/trace" appsv1 "k8s.io/api/apps/v1" + v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/fields" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/log/zap" - runtimescheme "sigs.k8s.io/controller-runtime/pkg/scheme" "github.com/gravitational/teleport/integrations/kube-agent-updater/pkg/controller" "github.com/gravitational/teleport/integrations/kube-agent-updater/pkg/img" @@ -39,18 +41,12 @@ import ( ) var ( - SchemeBuilder = &runtimescheme.Builder{GroupVersion: appsv1.SchemeGroupVersion} - scheme = runtime.NewScheme() + scheme = runtime.NewScheme() ) func init() { - SchemeBuilder.Register( - &appsv1.Deployment{}, - &appsv1.DeploymentList{}, - &appsv1.StatefulSet{}, - &appsv1.StatefulSetList{}, - ) - utilruntime.Must(SchemeBuilder.AddToScheme(scheme)) + utilruntime.Must(appsv1.AddToScheme(scheme)) + utilruntime.Must(v1.AddToScheme(scheme)) } func main() { @@ -61,12 +57,22 @@ func main() { var metricsAddr string var probeAddr string var syncPeriod time.Duration + var baseImageName string + var versionServer string + var versionChannel string + var insecureNoVerify bool + var disableLeaderElection bool flag.StringVar(&agentName, "agent-name", "", "The name of the agent that should be updated. This is mandatory.") flag.StringVar(&agentNamespace, "agent-namespace", "", "The namespace of the agent that should be updated. 
This is mandatory.") flag.StringVar(&metricsAddr, "metrics-addr", ":8080", "The address the metric endpoint binds to.") flag.StringVar(&probeAddr, "healthz-addr", ":8081", "The address the probe endpoint binds to.") flag.DurationVar(&syncPeriod, "sync-period", 10*time.Hour, "Operator sync period (format: https://pkg.go.dev/time#ParseDuration)") + flag.BoolVar(&insecureNoVerify, "insecure-no-verify-image", false, "Disable image signature verification.") + flag.BoolVar(&disableLeaderElection, "disable-leader-election", false, "Disable leader election, used when running the kube-agent-updater outside of Kubernetes.") + flag.StringVar(&versionServer, "version-server", "https://update.gravitational.io/v1/", "URL of the HTTP server advertising target version and critical maintenances. Trailing slash is optional.") + flag.StringVar(&versionChannel, "version-channel", "cloud/stable", "Version channel to get updates from.") + flag.StringVar(&baseImageName, "base-image", "public.ecr.aws/gravitational/teleport", "Image reference containing registry and repository.") opts := zap.Options{ Development: true, @@ -88,7 +94,7 @@ func main() { MetricsBindAddress: metricsAddr, Port: 9443, HealthProbeBindAddress: probeAddr, - LeaderElection: true, + LeaderElection: !disableLeaderElection, LeaderElectionID: agentName, Namespace: agentNamespace, SyncPeriod: &syncPeriod, @@ -108,16 +114,36 @@ func main() { os.Exit(1) } - // TODO: replace those mocks by the real thing - versionGetter := version.NewGetterMock("12.0.3", nil) - imageValidators := []img.Validator{ - img.NewImageValidatorMock("mock", true, img.NewImageRef("", "", "", "")), + versionServerURL, err := url.Parse(strings.TrimRight(versionServer, "/") + "/" + versionChannel) + if err != nil { + ctrl.Log.Error(err, "failed to pasre version server URL, exiting") + os.Exit(1) + } + versionGetter := version.NewBasicHTTPVersionGetter(versionServerURL) + maintenanceTriggers := maintenance.Triggers{ + 
maintenance.NewBasicHTTPMaintenanceTrigger("critical update", versionServerURL), + maintenance.NewUnhealthyWorkloadTrigger("unhealthy pods", mgr.GetClient()), + maintenance.NewWindowTrigger("maintenance window", mgr.GetClient()), + } + + var imageValidators img.Validators + if insecureNoVerify { + ctrl.Log.Info("INSECURE: Image validation disabled") + imageValidators = append(imageValidators, img.NewInsecureValidator("insecure always verify")) + } else { + validator, err := img.NewCosignSingleKeyValidator(teleportProdOCIPubKey, "cosign signature validator") + if err != nil { + ctrl.Log.Error(err, "failed to build image validator, exiting") + os.Exit(1) + } + imageValidators = append(imageValidators, validator) } - maintenanceTriggers := []maintenance.Trigger{ - maintenance.NewMaintenanceTriggerMock("never", false), + + baseImage, err := reference.ParseNamed(baseImageName) + if err != nil { + ctrl.Log.Error(err, "failed to parse base image reference, exiting") + os.Exit(1) } - baseImage, _ := reference.ParseNamed("public.ecr.aws/trent-playground/gravitational/teleport") - // End of mocks versionUpdater := controller.NewVersionUpdater(versionGetter, imageValidators, maintenanceTriggers, baseImage) diff --git a/integrations/kube-agent-updater/pkg/controller/constants.go b/integrations/kube-agent-updater/pkg/controller/constants.go index fe0bdea604bc4..72b36655f9d20 100644 --- a/integrations/kube-agent-updater/pkg/controller/constants.go +++ b/integrations/kube-agent-updater/pkg/controller/constants.go @@ -28,6 +28,9 @@ const ( defaultRequeue = 30 * time.Minute reconciliationTimeout = 2 * time.Minute kubeClientTimeout = 1 * time.Minute + // skipReconciliationAnnotation is inspired by the tenant-operator one + // (from the Teleport Cloud) but namespaced under `teleport.dev` + skipReconciliationAnnotation = "teleport.dev/skipreconcile" ) var ( diff --git a/integrations/kube-agent-updater/pkg/controller/deployment.go 
b/integrations/kube-agent-updater/pkg/controller/deployment.go index 7ddf5a7bcdae3..4f6b1d9a0cbba 100644 --- a/integrations/kube-agent-updater/pkg/controller/deployment.go +++ b/integrations/kube-agent-updater/pkg/controller/deployment.go @@ -55,6 +55,10 @@ func (r *DeploymentVersionUpdater) Reconcile(ctx context.Context, req ctrl.Reque } return ctrl.Result{}, trace.Wrap(err) } + if skipReconciliation(&obj) { + log.Info("Reconciliation disabled by resource annotations. Skipping.") + return requeueLater, nil + } // Get the current and past version currentVersion, err := getWorkloadVersion(obj.Spec.Template.Spec) diff --git a/integrations/kube-agent-updater/pkg/controller/statefulset.go b/integrations/kube-agent-updater/pkg/controller/statefulset.go index a9c651144f44d..93f9895323dbe 100644 --- a/integrations/kube-agent-updater/pkg/controller/statefulset.go +++ b/integrations/kube-agent-updater/pkg/controller/statefulset.go @@ -80,6 +80,10 @@ func (r *StatefulSetVersionUpdater) Reconcile(ctx context.Context, req ctrl.Requ } return ctrl.Result{}, trace.Wrap(err) } + if skipReconciliation(&obj) { + log.Info("Reconciliation disabled by resource annotations. Skipping.") + return requeueLater, nil + } // Get the current and past version currentVersion, err := getWorkloadVersion(obj.Spec.Template.Spec) diff --git a/integrations/kube-agent-updater/pkg/controller/utils.go b/integrations/kube-agent-updater/pkg/controller/utils.go index c28f7af5c582b..091f869a8a3a3 100644 --- a/integrations/kube-agent-updater/pkg/controller/utils.go +++ b/integrations/kube-agent-updater/pkg/controller/utils.go @@ -17,9 +17,12 @@ limitations under the License. 
package controller import ( + "strconv" + "github.com/docker/distribution/reference" "github.com/gravitational/trace" v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "github.com/gravitational/teleport/integrations/kube-agent-updater/pkg/version" ) @@ -65,3 +68,19 @@ func setContainerImageFromPodSpec(spec *v1.PodSpec, container, image string) err } return trace.NotFound("container %q not found in podSpec", container) } + +// skipReconciliation checks if the object has an annotation specifying that we +// must skip the reconciliation. Disabling reconciliation is useful for +// debugging purposes or when the user wants to suspend the updater for some +// reason. +func skipReconciliation(object metav1.Object) bool { + annotations := object.GetAnnotations() + if reconciliationAnnotation, ok := annotations[skipReconciliationAnnotation]; ok { + skip, err := strconv.ParseBool(reconciliationAnnotation) + if err != nil { + return false + } + return skip + } + return false +} diff --git a/integrations/kube-agent-updater/pkg/img/insecure.go b/integrations/kube-agent-updater/pkg/img/insecure.go new file mode 100644 index 0000000000000..1b30655917af1 --- /dev/null +++ b/integrations/kube-agent-updater/pkg/img/insecure.go @@ -0,0 +1,63 @@ +/* +Copyright 2023 Gravitational, Inc. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package img + +import ( + "context" + + "github.com/docker/distribution/reference" + "github.com/gravitational/trace" + "github.com/opencontainers/go-digest" +) + +type insecureValidator struct { + name string +} + +// Name returns the validator name +func (v *insecureValidator) Name() string { + return v.name +} + +// TODO: cache this to protect against registry quotas +// The image validation is only invoked when we are in a maintenance window and +// the target version is different than our current version. In regular usage we +// are called only once per update. However, Kubernetes controllers failure mode +// is usually infinite retry loop. If something fails after the image validation, +// we might get called in a loop indefinitely. To mitigate the impact of such +// failure, ValidateAndResolveDigest should cache its result. + +// ValidateAndResolveDigest resolves the image digest and always return the +// image is valid. Using this validator makes you vulnerable in case of image +// registry compromise. +func (v *insecureValidator) ValidateAndResolveDigest(ctx context.Context, image reference.NamedTagged) (NamedTaggedDigested, error) { + ref, err := NamedTaggedToDigest(image) + if err != nil { + return nil, trace.Wrap(err) + } + + digestedImage := NewImageRef(ref.RegistryStr(), ref.RepositoryStr(), image.Tag(), digest.Digest(ref.DigestStr())) + return digestedImage, nil +} + +// NewInsecureValidator returns an img.Validator that only resolves the image +// but does not check its signature. 
+func NewInsecureValidator(name string) Validator { + return &insecureValidator{ + name: name, + } +} diff --git a/integrations/kube-agent-updater/pkg/podutils/filter.go b/integrations/kube-agent-updater/pkg/podutils/filter.go index 625bbd788cd3f..b91bdf52cb1ee 100644 --- a/integrations/kube-agent-updater/pkg/podutils/filter.go +++ b/integrations/kube-agent-updater/pkg/podutils/filter.go @@ -63,7 +63,13 @@ func Not(filterFunc FilterFunc) FilterFunc { } } -const podReadinessGracePeriod = 10 * time.Minute +// podReadinessGracePeriod represents how much time we wait before we consider +// the pod (and a fortiori the workload) unhealthy. We might want to empirically +// tune this value. A higher value can lead to workloads being stuck longer in +// case of error. A shorter value might cause false positives and trigger +// updates because of other cluster-related events like network issues, registry +// downtime or missing capacity. +const podReadinessGracePeriod = 5 * time.Minute // IsUnhealthy checks if a pod has not been ready since at least 10 minutes/ // This heuristic also detects infrastructure issues like not enough room to