Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions cmd/gcp-routes-controller/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# gcp-routes-controller

## Background

The Google Cloud load balancer is an unusual L3 load balancer: it doesn't do DNAT; instead, it
just redirects traffic to backends and preserves the VIP as the destination IP.

So, an agent exists on the node. It programs the node (either via iptables or routing tables) to
accept traffic destined for the VIP. However, this has a problem: all hairpin traffic
to the balanced service is *always* handled by that backend, even if it is down
or otherwise out of rotation.

We want to withdraw the internal API service from Google routes redirection when
it's down, or else the node (i.e. kubelet) loses access to the apiserver VIP
and becomes unmanageable.

## Functionality

The gcp-routes-controller is installed on all the masters and monitors the
apiserver's `/readyz` endpoint.

When `/readyz` fails, the controller stops the VIP routing by writing `/run/gcp-routes/VIP.down`,
which tells openshift-gcp-routes to skip that VIP.
155 changes: 129 additions & 26 deletions cmd/gcp-routes-controller/run.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,13 @@ import (
"crypto/tls"
"flag"
"fmt"
"net"
"net/http"
"net/url"
"os"
"os/exec"
"os/signal"
"path"
"sync"
"syscall"
"time"
Expand All @@ -32,16 +34,33 @@ var (
runOpts struct {
gcpRoutesService string
rootMount string

healthCheckURL string
healthCheckURL string
vip string
}
)

// downFileDir is the directory in which gcp-routes will look for a flag-file that
// indicates the route to the VIP should be withdrawn.
//
// The path is relative to the node's root filesystem: the controller joins it
// with runOpts.rootMount before touching it, and the flag-file itself is named
// "<vip>.down".
const downFileDir = "/run/gcp-routes"

// init attaches the run subcommand to the root command and registers its
// flags. Each flag is registered exactly once: registering the same flag name
// twice on one flag set makes pflag panic at start-up.
func init() {
	rootCmd.AddCommand(runCmd)

	flags := runCmd.PersistentFlags()
	flags.StringVar(&runOpts.gcpRoutesService, "gcp-routes-service", "openshift-gcp-routes.service", "The name for the service controlling gcp routes on host")
	flags.StringVar(&runOpts.rootMount, "root-mount", "/rootfs", "where the nodes root filesystem is mounted for writing down files or chrooting.")
	flags.StringVar(&runOpts.healthCheckURL, "health-check-url", "", "HTTP(s) URL for the health check")
	flags.StringVar(&runOpts.vip, "vip", "", "The VIP to remove if the health check fails. Determined from URL if not provided")
}

// downMode selects how the controller withdraws the VIP when the health check
// fails.
type downMode int

const (
	// modeStopService stops (and later restarts) the gcp routes systemd
	// service. It is the zero value of downMode.
	modeStopService downMode = iota
	// modeDownFile creates (and later removes) a "<vip>.down" flag-file in
	// downFileDir instead of touching the service.
	modeDownFile
)

// handler carries out the reaction to health-check transitions: its onFailure
// and onSuccess methods either stop/start the gcp routes service or
// create/remove a down file, depending on mode.
type handler struct {
	// mode is chosen once in newHandler and decides which mechanism is used.
	mode downMode
	// vip is the address used to name the "<vip>.down" flag-file. It is only
	// populated (and only read) in modeDownFile.
	vip string
}

func runRunCmd(cmd *cobra.Command, args []string) error {
Expand All @@ -51,18 +70,6 @@ func runRunCmd(cmd *cobra.Command, args []string) error {
// To help debugging, immediately log version
glog.Infof("Version: %+v (%s)", version.Raw, version.Hash)

if runOpts.rootMount != "" {
glog.Infof(`Calling chroot("%s")`, runOpts.rootMount)
if err := syscall.Chroot(runOpts.rootMount); err != nil {
return fmt.Errorf("Unable to chroot to %s: %s", runOpts.rootMount, err)
}

glog.V(2).Infof("Moving to / inside the chroot")
if err := os.Chdir("/"); err != nil {
return fmt.Errorf("Unable to change directory to /: %s", err)
}
}

uri, err := url.Parse(runOpts.healthCheckURL)
if err != nil {
return fmt.Errorf("failed to parse health-check-url: %v", err)
Expand All @@ -71,6 +78,14 @@ func runRunCmd(cmd *cobra.Command, args []string) error {
return fmt.Errorf("invalid URI %q (no scheme)", uri)
}

handler, err := newHandler(uri)
if err != nil {
return err
}

// The health check should always connect to localhost, not be load-balanced
uri.Host = net.JoinHostPort("localhost", uri.Port())

httpCheck, err := checkers.NewHTTP(&checkers.HTTPConfig{
URL: uri,
Client: &http.Client{Transport: &http.Transport{
Expand All @@ -82,22 +97,27 @@ func runRunCmd(cmd *cobra.Command, args []string) error {
if err != nil {
return fmt.Errorf("failed to create httpCheck: %v", err)
}

errCh := make(chan error)

// careful: the timing here needs to correspond to the load balancer's
// parameters. We need to remove routes just after we've been removed
// as a backend in the load-balancer, and add routes before we've been
// re-added.
// see openshift/installer/data/data/gcp/network/lb-private.tf
tracker := &healthTracker{
state: unknownTrackerState,
ErrCh: errCh,
SuccessThreshold: 2,
FailureThreshold: 10,
OnFailure: func() error { return exec.Command("systemctl", "stop", runOpts.gcpRoutesService).Run() },
OnSuccess: func() error { return exec.Command("systemctl", "start", runOpts.gcpRoutesService).Run() },
SuccessThreshold: 1,
FailureThreshold: 8, // LB = 6 seconds, plus 10 seconds for propagation
OnFailure: handler.onFailure,
OnSuccess: handler.onSuccess,
}

h := health.New()
h.AddChecks([]*health.Config{{
Name: "dependency-check",
Checker: httpCheck,
Interval: time.Duration(5) * time.Second,
Interval: time.Duration(2) * time.Second,
Fatal: true,
OnComplete: tracker.OnComplete,
}})
Expand All @@ -111,11 +131,10 @@ func runRunCmd(cmd *cobra.Command, args []string) error {
go func() {
for sig := range c {
glog.Infof("Signal %s received: shutting down gcp routes service", sig)
if err := exec.Command("systemctl", "stop", runOpts.gcpRoutesService).Run(); err != nil {
glog.Infof("Failed to terminate gcp routes service on signal: %s", err)
} else {
break
if err := handler.onFailure(); err != nil {
glog.Infof("Failed to mark service down on signal: %s", err)
}
os.Exit(0)
}
}()

Expand All @@ -129,6 +148,90 @@ func runRunCmd(cmd *cobra.Command, args []string) error {
}
}

// newHandler decides how health-check failures will be acted upon and returns
// a handler configured accordingly.
//
// If downFileDir exists as a directory under runOpts.rootMount, the handler
// works in downfile mode and needs the VIP: it is taken from the --vip flag
// or, when that is empty, resolved from the hostname of uri, which must resolve
// to exactly one address.
//
// Otherwise the handler falls back to stopping/starting the gcp routes
// service; in that mode the process chroots into runOpts.rootMount (when set)
// so that systemctl runs against the host.
//
// An error is returned when the chroot fails or the VIP cannot be determined.
func newHandler(uri *url.URL) (*handler, error) {
	h := handler{}

	// determine mode: if /run/gcp-routes exists, we can use the downfile mode
	realPath := path.Join(runOpts.rootMount, downFileDir)
	fi, err := os.Stat(realPath)
	if err == nil && fi.IsDir() {
		glog.Infof("%s exists, starting in downfile mode", realPath)
		h.mode = modeDownFile
	} else {
		// Report the configured service name rather than a hard-coded one, so
		// the log matches what onFailure will actually stop.
		glog.Infof("%s not accessible, will stop %s on health failure", realPath, runOpts.gcpRoutesService)
		h.mode = modeStopService
	}

	// if StopService mode and rootfs specified, chroot
	if h.mode == modeStopService && runOpts.rootMount != "" {
		glog.Infof(`Calling chroot("%s")`, runOpts.rootMount)
		if err := syscall.Chroot(runOpts.rootMount); err != nil {
			return nil, fmt.Errorf("unable to chroot to %s: %s", runOpts.rootMount, err)
		}

		glog.V(2).Infof("Moving to / inside the chroot")
		if err := os.Chdir("/"); err != nil {
			return nil, fmt.Errorf("unable to change directory to /: %s", err)
		}
	}

	// otherwise, resolve vip
	if h.mode == modeDownFile {
		if runOpts.vip != "" {
			h.vip = runOpts.vip
		} else {
			addrs, err := net.LookupHost(uri.Hostname())
			if err != nil {
				return nil, fmt.Errorf("failed to lookup host %s: %v", uri.Hostname(), err)
			}
			// The down file is named after a single address; refuse to guess
			// when the name is ambiguous.
			if len(addrs) != 1 {
				return nil, fmt.Errorf("hostname %s has %d addresses, expected 1 - aborting", uri.Hostname(), len(addrs))
			}
			h.vip = addrs[0]
			glog.Infof("Using VIP %s", h.vip)
		}
	}

	return &h, nil
}

// onFailure withdraws the VIP. In downfile mode it creates the "<vip>.down"
// flag-file under downFileDir (relative to runOpts.rootMount); in any other
// mode it stops the gcp routes service through systemctl.
func (h *handler) onFailure() error {
	if h.mode != modeDownFile {
		stop := exec.Command("systemctl", "stop", runOpts.gcpRoutesService)
		if err := stop.Run(); err != nil {
			return fmt.Errorf("Failed to terminate gcp routes service %v", err)
		}
		glog.Infof("healthcheck failed, stopped %s", runOpts.gcpRoutesService)
		return nil
	}

	flagName := fmt.Sprintf("%s.down", h.vip)
	downFile := path.Join(runOpts.rootMount, downFileDir, flagName)

	// Only the file's existence matters, so it is created empty and closed
	// straight away.
	created, err := os.OpenFile(downFile, os.O_CREATE, 0644)
	if err != nil {
		return fmt.Errorf("failed to create downfile (%s): %v", downFile, err)
	}
	_ = created.Close()

	glog.Infof("healthcheck failed, created downfile %s", downFile)
	return nil
}

// onSuccess restores routing to the VIP. In downfile mode it removes the
// "<vip>.down" flag-file under downFileDir (relative to runOpts.rootMount); a
// file that is already absent is not an error. In any other mode it starts the
// gcp routes service through systemctl.
func (h *handler) onSuccess() error {
	if h.mode == modeDownFile {
		downFile := path.Join(runOpts.rootMount, downFileDir, fmt.Sprintf("%s.down", h.vip))
		err := os.Remove(downFile)
		if err != nil && !os.IsNotExist(err) {
			return fmt.Errorf("failed to remove downfile (%s): %v", downFile, err)
		}
		glog.Infof("healthcheck succeeded, removed downfile %s", downFile)
	} else {
		if err := exec.Command("systemctl", "start", runOpts.gcpRoutesService).Run(); err != nil {
			// This path starts the service; the message must not claim a
			// failed termination.
			return fmt.Errorf("failed to start gcp routes service: %v", err)
		}
		glog.Infof("healthcheck succeeded, started %s", runOpts.gcpRoutesService)
	}
	return nil
}

type trackerState int

const (
Expand Down
8 changes: 8 additions & 0 deletions pkg/controller/template/test_data/controller_config_aws.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,11 @@ spec:
setupEtcdEnv: image/setupEtcdEnv:1
infraImage: image/infraImage:1
kubeClientAgentImage: image/kubeClientAgentImage:1
infra:
apiVersion: config.openshift.io/v1
kind: Infrastructure
status:
apiServerInternalURI: https://api-int.my-test-cluster.installer.team.coreos.systems:6443
apiServerURL: https://api.my-test-cluster.installer.team.coreos.systems:6443
etcdDiscoveryDomain: my-test-cluster.installer.team.coreos.systems
infrastructureName: my-test-cluster
10 changes: 10 additions & 0 deletions pkg/controller/template/test_data/controller_config_baremetal.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,17 @@ spec:
infraImage: image/infraImage:1
kubeClientAgentImage: image/kubeClientAgentImage:1
infra:
apiVersion: config.openshift.io/v1
kind: Infrastructure
spec:
cloudConfig:
key: config
name: cloud-provider-config
status:
apiServerInternalURI: https://api-int.my-test-cluster.installer.team.coreos.systems:6443
apiServerURL: https://api.my-test-cluster.installer.team.coreos.systems:6443
etcdDiscoveryDomain: my-test-cluster.installer.team.coreos.systems
infrastructureName: my-test-cluster
platformStatus:
baremetal:
apiServerInternalIP: 10.0.0.1
Expand Down
10 changes: 9 additions & 1 deletion pkg/controller/template/test_data/controller_config_gcp.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,12 @@ spec:
etcd: image/etcd:1
setupEtcdEnv: image/setupEtcdEnv:1
infraImage: image/infraImage:1
kubeClientAgentImage: image/kubeClientAgentImage:1
kubeClientAgentImage: image/kubeClientAgentImage:1
infra:
apiVersion: config.openshift.io/v1
kind: Infrastructure
status:
apiServerInternalURI: https://api-int.my-test-cluster.installer.team.coreos.systems:6443
apiServerURL: https://api.my-test-cluster.installer.team.coreos.systems:6443
etcdDiscoveryDomain: my-test-cluster.installer.team.coreos.systems
infrastructureName: my-test-cluster
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,11 @@ spec:
setupEtcdEnv: image/setupEtcdEnv:1
infraImage: image/infraImage:1
kubeClientAgentImage: image/kubeClientAgentImage:1
infra:
apiVersion: config.openshift.io/v1
kind: Infrastructure
status:
apiServerInternalURI: https://api-int.my-test-cluster.installer.team.coreos.systems:6443
apiServerURL: https://api.my-test-cluster.installer.team.coreos.systems:6443
etcdDiscoveryDomain: my-test-cluster.installer.team.coreos.systems
infrastructureName: my-test-cluster
8 changes: 8 additions & 0 deletions pkg/controller/template/test_data/controller_config_none.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,11 @@ spec:
setupEtcdEnv: image/setupEtcdEnv:1
infraImage: image/infraImage:1
kubeClientAgentImage: image/kubeClientAgentImage:1
infra:
apiVersion: config.openshift.io/v1
kind: Infrastructure
status:
apiServerInternalURI: https://api-int.my-test-cluster.installer.team.coreos.systems:6443
apiServerURL: https://api.my-test-cluster.installer.team.coreos.systems:6443
etcdDiscoveryDomain: my-test-cluster.installer.team.coreos.systems
infrastructureName: my-test-cluster
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,13 @@ spec:
infraImage: image/infraImage:1
kubeClientAgentImage: image/kubeClientAgentImage:1
infra:
apiVersion: config.openshift.io/v1
kind: Infrastructure
status:
apiServerInternalURI: https://api-int.my-test-cluster.installer.team.coreos.systems:6443
apiServerURL: https://api.my-test-cluster.installer.team.coreos.systems:6443
etcdDiscoveryDomain: my-test-cluster.installer.team.coreos.systems
infrastructureName: my-test-cluster
platformStatus:
openstack:
apiServerInternalIP: 10.0.0.1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,11 @@ spec:
setupEtcdEnv: image/setupEtcdEnv:1
infraImage: image/infraImage:1
kubeClientAgentImage: image/kubeClientAgentImage:1
infra:
apiVersion: config.openshift.io/v1
kind: Infrastructure
status:
apiServerInternalURI: https://api-int.my-test-cluster.installer.team.coreos.systems:6443
apiServerURL: https://api.my-test-cluster.installer.team.coreos.systems:6443
etcdDiscoveryDomain: my-test-cluster.installer.team.coreos.systems
infrastructureName: my-test-cluster
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,13 @@ spec:
infraImage: image/infraImage:1
kubeClientAgentImage: image/kubeClientAgentImage:1
infra:
apiVersion: config.openshift.io/v1
kind: Infrastructure
status:
apiServerInternalURI: https://api-int.my-test-cluster.installer.team.coreos.systems:6443
apiServerURL: https://api.my-test-cluster.installer.team.coreos.systems:6443
etcdDiscoveryDomain: my-test-cluster.installer.team.coreos.systems
infrastructureName: my-test-cluster
platformStatus:
vsphere:
apiServerInternalIP: 10.0.0.1
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ contents:
command: ["gcp-routes-controller"]
args:
- "run"
- "--health-check-url=https://127.0.0.1:6443/readyz"
- "--health-check-url={{.Infra.Status.APIServerInternalURL}}/readyz"
resources:
requests:
cpu: 20m
Expand Down
Loading