diff --git a/README.md b/README.md index 184906604..6d2e7ce39 100644 --- a/README.md +++ b/README.md @@ -283,6 +283,15 @@ message and optionally stacktrace. For example, the startup message looks like: {"severity":"INFO","timestamp":"2020-10-12T07:20:50.52Z","caller":"cloud_sql_proxy/cloud_sql_proxy.go:510","message":"Using gcloud's active project: [my-project-id]"} ``` +#### `-use_http_health_check` + +Enables HTTP health checks for the proxy, including startup, liveness, and readiness probing. +Requires that you configure the Kubernetes container with HTTP probes ([instructions][health-check-example]). + +#### `-health_check_port=8090` + +Specifies the port that the health check server listens and serves on. Defaults to 8090. + ## Running as a Kubernetes Sidecar See the [example here][sidecar-example] as well as [Connecting from Google @@ -334,6 +343,7 @@ Install via Nuget, follow these [connect-to-k8s]: https://cloud.google.com/sql/docs/mysql/connect-kubernetes-engine [connection-overview]: https://cloud.google.com/sql/docs/mysql/connect-overview [contributing]: CONTRIBUTING.md +[health-check-example]: https://github.com/GoogleCloudPlatform/cloudsql-proxy/tree/main/examples/k8s-health-check#cloud-sql-proxy-health-checks [iam-auth]: https://cloud.google.com/sql/docs/postgres/authentication [pkg-badge]: https://pkg.go.dev/badge/github.com/GoogleCloudPlatform/cloudsql-proxy.svg [pkg-docs]: https://pkg.go.dev/github.com/GoogleCloudPlatform/cloudsql-proxy diff --git a/cmd/cloud_sql_proxy/cloud_sql_proxy.go b/cmd/cloud_sql_proxy/cloud_sql_proxy.go index 50b4fb577..84fca15d0 100644 --- a/cmd/cloud_sql_proxy/cloud_sql_proxy.go +++ b/cmd/cloud_sql_proxy/cloud_sql_proxy.go @@ -33,6 +33,7 @@ import ( "syscall" "time" + "github.com/GoogleCloudPlatform/cloudsql-proxy/cmd/cloud_sql_proxy/internal/healthcheck" "github.com/GoogleCloudPlatform/cloudsql-proxy/logging" "github.com/GoogleCloudPlatform/cloudsql-proxy/proxy/certs" 
"github.com/GoogleCloudPlatform/cloudsql-proxy/proxy/fuse" @@ -131,6 +132,10 @@ unavailable.`, `When set, the proxy uses this host as the base API path. Example: https://sqladmin.googleapis.com`, ) + + // Settings for healthcheck + useHTTPHealthCheck = flag.Bool("use_http_health_check", false, "When set, creates an HTTP server that checks and communicates the health of the proxy client.") + healthCheckPort = flag.String("health_check_port", "8090", "When applicable, health checks take place on this port number. Defaults to 8090.") ) const ( @@ -580,6 +585,16 @@ func main() { RefreshCfgBuffer: refreshCfgBuffer, } + var hc *healthcheck.Server + if *useHTTPHealthCheck { + hc, err = healthcheck.NewServer(proxyClient, *healthCheckPort) + if err != nil { + logging.Errorf("Could not initialize health check server: %v", err) + os.Exit(1) + } + defer hc.Close(ctx) + } + // Initialize a source of new connections to Cloud SQL instances. var connSrc <-chan proxy.Conn if *useFuse { @@ -619,6 +634,10 @@ func main() { logging.Infof("Ready for new connections") + if hc != nil { + hc.NotifyStarted() + } + signals := make(chan os.Signal, 1) signal.Notify(signals, syscall.SIGTERM, syscall.SIGINT) diff --git a/cmd/cloud_sql_proxy/internal/healthcheck/healthcheck.go b/cmd/cloud_sql_proxy/internal/healthcheck/healthcheck.go new file mode 100644 index 000000000..3bb956af8 --- /dev/null +++ b/cmd/cloud_sql_proxy/internal/healthcheck/healthcheck.go @@ -0,0 +1,153 @@ +// Copyright 2021 Google LLC All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package healthcheck tests and communicates the health of the Cloud SQL Auth proxy.
+package healthcheck
+
+import (
+	"context"
+	"errors"
+	"net"
+	"net/http"
+	"sync"
+
+	"github.com/GoogleCloudPlatform/cloudsql-proxy/logging"
+	"github.com/GoogleCloudPlatform/cloudsql-proxy/proxy/proxy"
+)
+
+const (
+	startupPath   = "/startup"
+	livenessPath  = "/liveness"
+	readinessPath = "/readiness"
+)
+
+// Server serves the proxy's health over HTTP through the startup, liveness
+// and readiness endpoints registered by NewServer.
+type Server struct {
+	// started stays open while the proxy is still starting up and is closed
+	// by NotifyStarted once startup is complete.
+	started chan struct{}
+	// once guards the close of started, so calling NotifyStarted more than
+	// one time does not close the channel twice.
+	once *sync.Once
+	// port is the port number handed to NewServer; the HTTP server listens
+	// on it.
+	port string
+	// srv is the HTTP server that answers the health check requests.
+	srv *http.Server
+}
+
+// NewServer creates a Server, registers the startup, readiness and liveness
+// handlers, binds a TCP listener on the given port and starts serving on it
+// in a background goroutine. It returns an error if the listener cannot be
+// created. The readiness handler consults c to see whether connections are
+// still available.
+func NewServer(c *proxy.Client, port string) (*Server, error) {
+	handlers := http.NewServeMux()
+
+	s := &Server{
+		started: make(chan struct{}),
+		once:    &sync.Once{},
+		port:    port,
+		srv: &http.Server{
+			Addr:    ":" + port,
+			Handler: handlers,
+		},
+	}
+
+	// report answers one probe: 200 with "ok" when healthy, otherwise 503
+	// with "error".
+	report := func(w http.ResponseWriter, healthy bool) {
+		status, body := http.StatusServiceUnavailable, "error"
+		if healthy {
+			status, body = http.StatusOK, "ok"
+		}
+		w.WriteHeader(status)
+		w.Write([]byte(body))
+	}
+
+	handlers.HandleFunc(startupPath, func(w http.ResponseWriter, _ *http.Request) {
+		report(w, s.proxyStarted())
+	})
+
+	handlers.HandleFunc(readinessPath, func(w http.ResponseWriter, _ *http.Request) {
+		report(w, isReady(c, s))
+	})
+
+	handlers.HandleFunc(livenessPath, func(w http.ResponseWriter, _ *http.Request) {
+		// isLive() always returns true, so the 503 branch is not expected
+		// to be taken for this endpoint.
+		report(w, isLive())
+	})
+
+	listener, err := net.Listen("tcp", s.srv.Addr)
+	if err != nil {
+		return nil, err
+	}
+
+	go func() {
+		serveErr := s.srv.Serve(listener)
+		// http.ErrServerClosed is what Serve returns after Shutdown; it is
+		// not a failure worth logging.
+		if serveErr == nil || errors.Is(serveErr, http.ErrServerClosed) {
+			return
+		}
+		logging.Errorf("Failed to start health check HTTP server: %v", serveErr)
+	}()
+
+	return s, nil
+}
+
+// Close shuts down the Server's HTTP server by calling Shutdown with ctx and
+// returns whatever error Shutdown reports.
+func (s *Server) Close(ctx context.Context) error {
+	return s.srv.Shutdown(ctx)
+}
+
+// NotifyStarted records that the proxy has finished starting up. Calling it
+// more than once is safe; only the first call closes the started channel.
+func (s *Server) NotifyStarted() {
+	s.once.Do(func() { close(s.started) })
+}
+
+// proxyStarted reports, without blocking, whether the started channel has
+// been closed.
+func (s *Server) proxyStarted() bool {
+	select {
+	case <-s.started:
+		return true
+	default:
+	}
+	return false
+}
+
+// isLive returns true as long as the proxy is running.
+func isLive() bool {
+	// Unconditional: there is no failure condition checked here.
+	return true
+}
+
+// isReady reports whether the proxy is ready for new connections. Both of
+// the following must hold, and the first one that fails is logged:
+//  1. The proxy has finished starting up (s.proxyStarted() is true).
+//  2. The client still has a connection available (c.AvailableConn() is
+//     true), i.e. the optional MaxConnections limit has not been hit.
+func isReady(c *proxy.Client, s *Server) bool {
+	switch {
+	case !s.proxyStarted():
+		// Startup has not yet been signalled to the health check server.
+		logging.Errorf("Readiness failed because proxy has not finished starting up.")
+		return false
+	case !c.AvailableConn():
+		// The proxy is at its configured connection limit.
+		logging.Errorf("Readiness failed because proxy has reached the maximum connections limit (%d).", c.MaxConnections)
+		return false
+	default:
+		return true
+	}
+}
diff --git a/cmd/cloud_sql_proxy/internal/healthcheck/healthcheck_test.go b/cmd/cloud_sql_proxy/internal/healthcheck/healthcheck_test.go new file mode 100644 index 000000000..f0b2a8d3f --- /dev/null +++ b/cmd/cloud_sql_proxy/internal/healthcheck/healthcheck_test.go @@ -0,0 +1,155 @@ +// Copyright 2021 Google LLC All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+
+package healthcheck_test
+
+import (
+	"context"
+	"net/http"
+	"testing"
+
+	"github.com/GoogleCloudPlatform/cloudsql-proxy/cmd/cloud_sql_proxy/internal/healthcheck"
+	"github.com/GoogleCloudPlatform/cloudsql-proxy/proxy/proxy"
+)
+
+const (
+	startupPath   = "/startup"
+	livenessPath  = "/liveness"
+	readinessPath = "/readiness"
+	testPort      = "8090"
+)
+
+// TestLiveness verifies that, once the health check server is up, the
+// liveness endpoint answers with http.StatusOK.
+func TestLiveness(t *testing.T) {
+	s, err := healthcheck.NewServer(&proxy.Client{}, testPort)
+	if err != nil {
+		t.Fatalf("Could not initialize health check: %v", err)
+	}
+	defer s.Close(context.Background())
+
+	resp, err := http.Get("http://localhost:" + testPort + livenessPath)
+	if err != nil {
+		t.Fatalf("HTTP GET failed: %v", err)
+	}
+	// The caller owns the response body; close it so the client does not
+	// leak the underlying connection.
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		t.Errorf("Got status code %v instead of %v", resp.StatusCode, http.StatusOK)
+	}
+}
+
+// Test to verify that when startup has NOT finished, the startup and readiness endpoints write
+// http.StatusServiceUnavailable.
+func TestStartupFail(t *testing.T) {
+	s, err := healthcheck.NewServer(&proxy.Client{}, testPort)
+	if err != nil {
+		t.Fatalf("Could not initialize health check: %v\n", err)
+	}
+	defer s.Close(context.Background())
+
+	resp, err := http.Get("http://localhost:" + testPort + startupPath)
+	if err != nil {
+		t.Fatalf("HTTP GET failed: %v\n", err)
+	}
+	// defer captures this response's body now, so reusing resp below does
+	// not prevent the first body from being closed.
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusServiceUnavailable {
+		t.Errorf("%v returned status code %v instead of %v", startupPath, resp.StatusCode, http.StatusServiceUnavailable)
+	}
+
+	resp, err = http.Get("http://localhost:" + testPort + readinessPath)
+	if err != nil {
+		t.Fatalf("HTTP GET failed: %v\n", err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusServiceUnavailable {
+		t.Errorf("%v returned status code %v instead of %v", readinessPath, resp.StatusCode, http.StatusServiceUnavailable)
+	}
+}
+
+// TestStartupPass verifies that once NotifyStarted has been called (and no
+// MaxConnections limit is set), the startup and readiness endpoints both
+// answer with http.StatusOK.
+func TestStartupPass(t *testing.T) {
+	s, err := healthcheck.NewServer(&proxy.Client{}, testPort)
+	if err != nil {
+		t.Fatalf("Could not initialize health check: %v\n", err)
+	}
+	defer s.Close(context.Background())
+
+	// Simulate the proxy client completing startup.
+	s.NotifyStarted()
+
+	resp, err := http.Get("http://localhost:" + testPort + startupPath)
+	if err != nil {
+		t.Fatalf("HTTP GET failed: %v\n", err)
+	}
+	// defer captures this response's body now, so reusing resp below does
+	// not prevent the first body from being closed.
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		t.Errorf("%v returned status code %v instead of %v", startupPath, resp.StatusCode, http.StatusOK)
+	}
+
+	resp, err = http.Get("http://localhost:" + testPort + readinessPath)
+	if err != nil {
+		t.Fatalf("HTTP GET failed: %v\n", err)
+	}
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		t.Errorf("%v returned status code %v instead of %v", readinessPath, resp.StatusCode, http.StatusOK)
+	}
+}
+
+// TestMaxConnectionsReached verifies that after startup has finished, the
+// readiness endpoint answers with http.StatusServiceUnavailable when the
+// connection counter has reached MaxConnections.
+func TestMaxConnectionsReached(t *testing.T) {
+	c := &proxy.Client{
+		MaxConnections: 1,
+	}
+	s, err := healthcheck.NewServer(c, testPort)
+	if err != nil {
+		t.Fatalf("Could not initialize health check: %v", err)
+	}
+	defer s.Close(context.Background())
+
+	s.NotifyStarted()
+	c.ConnectionsCounter = c.MaxConnections // Simulate reaching the limit for maximum number of connections
+
+	resp, err := http.Get("http://localhost:" + testPort + readinessPath)
+	if err != nil {
+		t.Fatalf("HTTP GET failed: %v", err)
+	}
+	// Close the body so the client does not leak the connection.
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusServiceUnavailable {
+		t.Errorf("Got status code %v instead of %v", resp.StatusCode, http.StatusServiceUnavailable)
+	}
+}
+
+// Test to verify that after closing a healthcheck, its liveness endpoint serves
+// an error.
+func TestCloseHealthCheck(t *testing.T) {
+	s, err := healthcheck.NewServer(&proxy.Client{}, testPort)
+	if err != nil {
+		t.Fatalf("Could not initialize health check: %v", err)
+	}
+	defer s.Close(context.Background())
+
+	resp, err := http.Get("http://localhost:" + testPort + livenessPath)
+	if err != nil {
+		t.Fatalf("HTTP GET failed: %v", err)
+	}
+	// Close the body so the client does not leak the connection.
+	defer resp.Body.Close()
+	if resp.StatusCode != http.StatusOK {
+		t.Errorf("Got status code %v instead of %v", resp.StatusCode, http.StatusOK)
+	}
+
+	err = s.Close(context.Background())
+	if err != nil {
+		t.Fatalf("Failed to close health check: %v", err)
+	}
+
+	// With the server shut down, the request itself is expected to fail.
+	resp, err = http.Get("http://localhost:" + testPort + livenessPath)
+	if err == nil {
+		// Unexpected success still hands us a body that must be closed.
+		resp.Body.Close()
+		t.Fatalf("HTTP GET did not return error after closing health check server.")
+	}
+}
diff --git a/examples/k8s-health-check/README.md b/examples/k8s-health-check/README.md new file mode 100644 index 000000000..a8fd01889 --- /dev/null +++ b/examples/k8s-health-check/README.md @@ -0,0 +1,70 @@ +# Cloud SQL proxy health checks + +Kubernetes supports three types of health checks. +1. Startup probes determine whether a container is done starting up. As soon as this probe succeeds, Kubernetes switches over to using liveness and readiness probing. +2. Liveness probes determine whether a container is healthy. When this probe is unsuccessful, the container is restarted. +3. Readiness probes determine whether a container can serve new traffic. When this probe fails, Kubernetes will wait to send requests to the container. + +## Running Cloud SQL proxy with health checks in Kubernetes +1. Configure your Cloud SQL proxy container to include health check probes. + > [proxy_with_http_health_check.yaml](proxy_with_http_health_check.yaml#L77-L111) + ```yaml + # Recommended configurations for health check probes. + # Probe parameters can be adjusted to best fit the requirements of your application.
+ # For details, see https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/ + livenessProbe: + httpGet: + path: /liveness + port: 8090 + # Number of seconds after the container has started before the first probe is scheduled. Defaults to 0. + # Not necessary when the startup probe is in use. + initialDelaySeconds: 0 + # Frequency of the probe. Defaults to 10. + periodSeconds: 10 + # Number of seconds after which the probe times out. Defaults to 1. + timeoutSeconds: 5 + # Number of times the probe is allowed to fail before the transition from healthy to failure state. + # Defaults to 3. + failureThreshold: 1 + readinessProbe: + httpGet: + path: /readiness + port: 8090 + initialDelaySeconds: 0 + periodSeconds: 10 + timeoutSeconds: 5 + # Number of times the probe must report success to transition from failure to healthy state. + # Defaults to 1 for readiness probe. + successThreshold: 1 + failureThreshold: 1 + startupProbe: + httpGet: + path: /startup + port: 8090 + periodSeconds: 1 + timeoutSeconds: 5 + failureThreshold: 20 + ``` + +2. Add `-use_http_health_check` and `-health_check_port` (optional) to your proxy container configuration under `command: `. + > [proxy_with_http_health_check.yaml](proxy_with_http_health_check.yaml#L39-L55) + ```yaml + command: + - "/cloud_sql_proxy" + + # If connecting from a VPC-native GKE cluster, you can use the + # following flag to have the proxy connect over private IP + # - "-ip_address_types=PRIVATE" + + # Replace DB_PORT with the port the proxy should listen on + # Defaults: MySQL: 3306, Postgres: 5432, SQLServer: 1433 + - "-instances==tcp:" + # Enables HTTP health checks. + - "-use_http_health_check" + # Specifies the health check server port. + # Defaults to 8090. 
+ - "-health_check_port=" + # This flag specifies where the service account key can be found + - "-credential_file=/secrets/service_account.json" + ``` + diff --git a/examples/k8s-health-check/proxy_with_http_health_check.yaml b/examples/k8s-health-check/proxy_with_http_health_check.yaml new file mode 100644 index 000000000..537771256 --- /dev/null +++ b/examples/k8s-health-check/proxy_with_http_health_check.yaml @@ -0,0 +1,115 @@ +# You must configure probes in your deployment to use health checks in Kubernetes. +# This sample configuration for HTTP probes is adapted from proxy_with_workload_identity.yaml. +apiVersion: apps/v1 +kind: Deployment +metadata: + name: +spec: + selector: + matchLabels: + app: + template: + metadata: + labels: + app: + spec: + containers: + - name: + # ... other container configuration + env: + - name: DB_USER + valueFrom: + secretKeyRef: + name: + key: username + - name: DB_PASS + valueFrom: + secretKeyRef: + name: + key: password + - name: DB_NAME + valueFrom: + secretKeyRef: + name: + key: database + - name: cloud-sql-proxy + # It is recommended to use the latest version of the Cloud SQL proxy + # Make sure to update on a regular schedule! + image: gcr.io/cloudsql-docker/gce-proxy:1.17 + command: + - "/cloud_sql_proxy" + + # If connecting from a VPC-native GKE cluster, you can use the + # following flag to have the proxy connect over private IP + # - "-ip_address_types=PRIVATE" + + # Replace DB_PORT with the port the proxy should listen on + # Defaults: MySQL: 3306, Postgres: 5432, SQLServer: 1433 + - "-instances==tcp:" + # Enables HTTP health checks. + - "-use_http_health_check" + # Specifies the health check server port. + # Defaults to 8090. + - "-health_check_port=" + # This flag specifies where the service account key can be found + - "-credential_file=/secrets/service_account.json" + securityContext: + # The default Cloud SQL proxy image runs as the + # "nonroot" user and group (uid: 65532) by default. 
+ runAsNonRoot: true + volumeMounts: + - name: + mountPath: /secrets/ + readOnly: true + # Resource configuration depends on an application's requirements. You + # should adjust the following values based on what your application + # needs. For details, see https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + resources: + requests: + # The proxy's memory use scales linearly with the number of active + # connections. Fewer open connections will use less memory. Adjust + # this value based on your application's requirements. + memory: "2Gi" + # The proxy's CPU use scales linearly with the amount of IO between + # the database and the application. Adjust this value based on your + # application's requirements. + cpu: "1" + # Recommended configurations for health check probes. + # Probe parameters can be adjusted to best fit the requirements of your application. + # For details, see https://kubernetes.io/docs/tasks/configure-pod-container/configure-liveness-readiness-startup-probes/ + livenessProbe: + httpGet: + path: /liveness + port: 8090 + # Number of seconds after the container has started before the first probe is scheduled. Defaults to 0. + # Not necessary when the startup probe is in use. + initialDelaySeconds: 0 + # Frequency of the probe. Defaults to 10. + periodSeconds: 10 + # Number of seconds after which the probe times out. Defaults to 1. + timeoutSeconds: 5 + # Number of times the probe is allowed to fail before the transition from healthy to failure state. + # Defaults to 3. + failureThreshold: 1 + readinessProbe: + httpGet: + path: /readiness + port: 8090 + initialDelaySeconds: 0 + periodSeconds: 10 + timeoutSeconds: 5 + # Number of times the probe must report success to transition from failure to healthy state. + # Defaults to 1 for readiness probe. 
+ successThreshold: 1 + failureThreshold: 1 + startupProbe: + httpGet: + path: /startup + port: 8090 + periodSeconds: 1 + timeoutSeconds: 5 + failureThreshold: 20 + volumes: + - name: + secret: + secretName:
diff --git a/proxy/proxy/client.go b/proxy/proxy/client.go
index 9b82d6789..0f80fc2aa 100644
--- a/proxy/proxy/client.go
+++ b/proxy/proxy/client.go
@@ -522,6 +522,16 @@ func (c *Client) InstanceVersionContext(ctx context.Context, instance string) (s
 	return version, nil
 }
 
+// AvailableConn reports whether the client can take on another connection.
+// It returns true when MaxConnections is 0 (no limit configured) or when the
+// current ConnectionsCounter is still below MaxConnections.
+func (c *Client) AvailableConn() bool {
+	if c.MaxConnections == 0 {
+		return true
+	}
+	return atomic.LoadUint64(&c.ConnectionsCounter) < c.MaxConnections
+}
+
 // Shutdown waits up to a given amount of time for all active connections to
 // close. Returns an error if there are still active connections after waiting
 // for the whole length of the timeout.