Skip to content

Commit

Permalink
Add OFSwitch connection check to Agent's liveness probes (#4447)
Browse files Browse the repository at this point in the history
This enables automatic recovery when an issue prevents OFSwitch reconnection
from working properly.

It also fixes a race condition between the IsConnected and SwitchConnected
methods of OFBridge and makes necessary changes to the constructor of
APIServer to allow testing.

For #4092

Signed-off-by: Quan Tian <[email protected]>
Co-authored-by: Xu Liu <[email protected]>
  • Loading branch information
tnqn and xliuxu committed Dec 16, 2022
1 parent 1ee8db4 commit 40bebc4
Show file tree
Hide file tree
Showing 15 changed files with 475 additions and 65 deletions.
12 changes: 6 additions & 6 deletions build/charts/antrea/templates/agent/daemonset.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -175,12 +175,12 @@ spec:
name: api
protocol: TCP
livenessProbe:
exec:
command:
- /bin/sh
- -c
- container_liveness_probe agent
initialDelaySeconds: 5
httpGet:
host: localhost
path: /livez
port: api
scheme: HTTPS
initialDelaySeconds: 10
timeoutSeconds: 5
periodSeconds: 10
failureThreshold: 5
Expand Down
4 changes: 1 addition & 3 deletions build/images/scripts/container_liveness_probe
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,7 @@

source daemon_status

if [ $1 == agent ]; then
exit 0
elif [ $1 == ovs ]; then
if [ $1 == ovs ]; then
check_ovs_status && exit 0
elif [ $1 == ovs-ipsec ]; then
check_ovs_ipsec_status && exit 0
Expand Down
12 changes: 6 additions & 6 deletions build/yamls/antrea-aks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3749,12 +3749,12 @@ spec:
name: api
protocol: TCP
livenessProbe:
exec:
command:
- /bin/sh
- -c
- container_liveness_probe agent
initialDelaySeconds: 5
httpGet:
host: localhost
path: /livez
port: api
scheme: HTTPS
initialDelaySeconds: 10
timeoutSeconds: 5
periodSeconds: 10
failureThreshold: 5
Expand Down
12 changes: 6 additions & 6 deletions build/yamls/antrea-eks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3751,12 +3751,12 @@ spec:
name: api
protocol: TCP
livenessProbe:
exec:
command:
- /bin/sh
- -c
- container_liveness_probe agent
initialDelaySeconds: 5
httpGet:
host: localhost
path: /livez
port: api
scheme: HTTPS
initialDelaySeconds: 10
timeoutSeconds: 5
periodSeconds: 10
failureThreshold: 5
Expand Down
12 changes: 6 additions & 6 deletions build/yamls/antrea-gke.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3748,12 +3748,12 @@ spec:
name: api
protocol: TCP
livenessProbe:
exec:
command:
- /bin/sh
- -c
- container_liveness_probe agent
initialDelaySeconds: 5
httpGet:
host: localhost
path: /livez
port: api
scheme: HTTPS
initialDelaySeconds: 10
timeoutSeconds: 5
periodSeconds: 10
failureThreshold: 5
Expand Down
12 changes: 6 additions & 6 deletions build/yamls/antrea-ipsec.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3768,12 +3768,12 @@ spec:
name: api
protocol: TCP
livenessProbe:
exec:
command:
- /bin/sh
- -c
- container_liveness_probe agent
initialDelaySeconds: 5
httpGet:
host: localhost
path: /livez
port: api
scheme: HTTPS
initialDelaySeconds: 10
timeoutSeconds: 5
periodSeconds: 10
failureThreshold: 5
Expand Down
12 changes: 6 additions & 6 deletions build/yamls/antrea.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3748,12 +3748,12 @@ spec:
name: api
protocol: TCP
livenessProbe:
exec:
command:
- /bin/sh
- -c
- container_liveness_probe agent
initialDelaySeconds: 5
httpGet:
host: localhost
path: /livez
port: api
scheme: HTTPS
initialDelaySeconds: 10
timeoutSeconds: 5
periodSeconds: 10
failureThreshold: 5
Expand Down
19 changes: 11 additions & 8 deletions cmd/antrea-agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import (
"k8s.io/apimachinery/pkg/fields"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/apiserver/pkg/server/options"
"k8s.io/client-go/informers"
coreinformers "k8s.io/client-go/informers/core/v1"
"k8s.io/client-go/tools/cache"
Expand Down Expand Up @@ -69,7 +70,6 @@ import (
"antrea.io/antrea/pkg/ovs/ovsctl"
"antrea.io/antrea/pkg/signals"
"antrea.io/antrea/pkg/util/channel"
"antrea.io/antrea/pkg/util/cipher"
"antrea.io/antrea/pkg/util/env"
"antrea.io/antrea/pkg/util/k8s"
"antrea.io/antrea/pkg/version"
Expand Down Expand Up @@ -685,20 +685,23 @@ func run(o *Options) error {

go agentMonitor.Run(stopCh)

cipherSuites, err := cipher.GenerateCipherSuitesList(o.config.TLSCipherSuites)
if err != nil {
return fmt.Errorf("error generating Cipher Suite list: %v", err)
}
secureServing := options.NewSecureServingOptions().WithLoopback()
secureServing.BindAddress = net.IPv4zero
secureServing.BindPort = o.config.APIPort
secureServing.CipherSuites = o.tlsCipherSuites
secureServing.MinTLSVersion = o.config.TLSMinVersion
authentication := options.NewDelegatingAuthenticationOptions()
authorization := options.NewDelegatingAuthorizationOptions().WithAlwaysAllowPaths("/healthz", "/livez", "/readyz")
apiServer, err := apiserver.New(
agentQuerier,
networkPolicyController,
mcastController,
externalIPController,
o.config.APIPort,
secureServing,
authentication,
authorization,
*o.config.EnablePrometheusMetrics,
o.config.ClientConnection.Kubeconfig,
cipherSuites,
cipher.TLSVersionMap[o.config.TLSMinVersion],
v4Enabled,
v6Enabled)
if err != nil {
Expand Down
24 changes: 24 additions & 0 deletions cmd/antrea-agent/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (

"github.com/spf13/pflag"
"gopkg.in/yaml.v2"
cliflag "k8s.io/component-base/cli/flag"
"k8s.io/klog/v2"

"antrea.io/antrea/pkg/agent/config"
Expand Down Expand Up @@ -58,6 +59,8 @@ type Options struct {
configFile string
// The configuration object
config *agentconfig.AgentConfig
// tlsCipherSuites is the list of TLS cipher suite names parsed from the user-provided TLSCipherSuites config string.
tlsCipherSuites []string
// IPFIX flow collector address
flowCollectorAddr string
// IPFIX flow collector protocol
Expand Down Expand Up @@ -140,6 +143,10 @@ func (o *Options) validate(args []string) error {
return err
}

if err := o.validateTLSOptions(); err != nil {
return err
}

if encapMode.SupportsNoEncap() {
// When using NoEncap traffic mode without AntreaProxy, Pod-to-Service traffic is handled by kube-proxy
// (iptables/ipvs) in the root netns. If the Endpoint is not local the DNATed traffic will be output to
Expand Down Expand Up @@ -311,6 +318,23 @@ func (o *Options) setDefaults() {
}
}

// validateTLSOptions checks the TLS-related fields of the agent configuration.
//
// It verifies that config.TLSMinVersion is accepted by cliflag.TLSVersion, then
// removes every space from config.TLSCipherSuites, splits the result on commas
// and verifies the resulting names with cliflag.TLSCipherSuites. On success the
// parsed names are stored in o.tlsCipherSuites; when no cipher suites are
// configured, o.tlsCipherSuites is left untouched.
//
// Returned errors wrap the underlying cliflag error (via %w) so callers can
// inspect the cause with errors.Is / errors.As.
func (o *Options) validateTLSOptions() error {
	if _, err := cliflag.TLSVersion(o.config.TLSMinVersion); err != nil {
		return fmt.Errorf("invalid TLSMinVersion: %w", err)
	}

	// Only spaces are stripped, so "A, B" and "A,B" are treated identically.
	trimmedTLSCipherSuites := strings.ReplaceAll(o.config.TLSCipherSuites, " ", "")
	if trimmedTLSCipherSuites == "" {
		// Nothing configured: nothing to validate or record.
		return nil
	}

	tlsCipherSuites := strings.Split(trimmedTLSCipherSuites, ",")
	if _, err := cliflag.TLSCipherSuites(tlsCipherSuites); err != nil {
		return fmt.Errorf("invalid TLSCipherSuites: %w", err)
	}
	// Keep the validated names (not the numeric IDs) for later use.
	o.tlsCipherSuites = tlsCipherSuites
	return nil
}

func (o *Options) validateAntreaProxyConfig() error {
if !features.DefaultFeatureGate.Enabled(features.AntreaProxy) {
// Validate service CIDR configuration if AntreaProxy is not enabled.
Expand Down
75 changes: 75 additions & 0 deletions cmd/antrea-agent/options_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
// Copyright 2022 Antrea Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package main

import (
"testing"

"github.com/stretchr/testify/assert"

agentconfig "antrea.io/antrea/pkg/config/agent"
)

// TestOptionsValidateTLSOptions exercises Options.validateTLSOptions with
// empty, invalid and valid TLS settings. For each case it checks the returned
// error and, on success, the cipher suite names recorded in o.tlsCipherSuites.
func TestOptionsValidateTLSOptions(t *testing.T) {
	tests := []struct {
		name   string
		config *agentconfig.AgentConfig
		// expectedCipherSuites is the expected value of o.tlsCipherSuites
		// after a successful validation (nil when none are configured).
		expectedCipherSuites []string
		// expectedErr is a substring of the expected error, or "" when the
		// validation is expected to succeed.
		expectedErr string
	}{
		{
			name: "empty input",
			config: &agentconfig.AgentConfig{
				TLSCipherSuites: "",
				TLSMinVersion:   "",
			},
			expectedErr: "",
		},
		{
			name: "invalid TLSMinVersion",
			config: &agentconfig.AgentConfig{
				TLSCipherSuites: "",
				TLSMinVersion:   "foo",
			},
			expectedErr: "invalid TLSMinVersion",
		},
		{
			name: "invalid TLSCipherSuites",
			config: &agentconfig.AgentConfig{
				TLSCipherSuites: "TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305, foo",
				TLSMinVersion:   "VersionTLS10",
			},
			expectedErr: "invalid TLSCipherSuites",
		},
		{
			name: "valid input",
			config: &agentconfig.AgentConfig{
				TLSCipherSuites: "TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305, TLS_RSA_WITH_AES_128_GCM_SHA256",
				TLSMinVersion:   "VersionTLS12",
			},
			// Spaces around the comma must have been stripped.
			expectedCipherSuites: []string{
				"TLS_ECDHE_RSA_WITH_CHACHA20_POLY1305",
				"TLS_RSA_WITH_AES_128_GCM_SHA256",
			},
			expectedErr: "",
		},
	}
	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			o := &Options{config: tt.config}
			err := o.validateTLSOptions()
			if tt.expectedErr != "" {
				assert.ErrorContains(t, err, tt.expectedErr)
				return
			}
			assert.NoError(t, err)
			// Besides returning nil, a successful validation must record
			// the parsed cipher suite names on the Options object.
			assert.Equal(t, tt.expectedCipherSuites, o.tlsCipherSuites)
		})
	}
}
46 changes: 32 additions & 14 deletions pkg/agent/apiserver/apiserver.go
Original file line number Diff line number Diff line change
Expand Up @@ -99,30 +99,41 @@ func installAPIGroup(s *genericapiserver.GenericAPIServer, aq agentquerier.Agent
}

// New creates an APIServer for running in antrea agent.
func New(aq agentquerier.AgentQuerier, npq querier.AgentNetworkPolicyInfoQuerier, mq querier.AgentMulticastInfoQuerier, seipq querier.ServiceExternalIPStatusQuerier,
bindPort int, enableMetrics bool, kubeconfig string, cipherSuites []uint16, tlsMinVersion uint16, v4Enabled, v6Enabled bool) (*agentAPIServer, error) {
cfg, err := newConfig(npq, bindPort, enableMetrics, kubeconfig)
func New(aq agentquerier.AgentQuerier,
npq querier.AgentNetworkPolicyInfoQuerier,
mq querier.AgentMulticastInfoQuerier,
seipq querier.ServiceExternalIPStatusQuerier,
secureServing *genericoptions.SecureServingOptionsWithLoopback,
authentication *genericoptions.DelegatingAuthenticationOptions,
authorization *genericoptions.DelegatingAuthorizationOptions,
enableMetrics bool,
kubeconfig string,
v4Enabled,
v6Enabled bool,
) (*agentAPIServer, error) {
cfg, err := newConfig(aq, npq, secureServing, authentication, authorization, enableMetrics, kubeconfig)
if err != nil {
return nil, err
}
s, err := cfg.New(Name, genericapiserver.NewEmptyDelegate())
if err != nil {
return nil, err
}
s.SecureServingInfo.CipherSuites = cipherSuites
s.SecureServingInfo.MinTLSVersion = tlsMinVersion
if err := installAPIGroup(s, aq, npq, v4Enabled, v6Enabled); err != nil {
return nil, err
}
installHandlers(aq, npq, mq, seipq, s)
return &agentAPIServer{GenericAPIServer: s}, nil
}

func newConfig(npq querier.AgentNetworkPolicyInfoQuerier, bindPort int, enableMetrics bool, kubeconfig string) (*genericapiserver.CompletedConfig, error) {
secureServing := genericoptions.NewSecureServingOptions().WithLoopback()
authentication := genericoptions.NewDelegatingAuthenticationOptions()
authorization := genericoptions.NewDelegatingAuthorizationOptions().WithAlwaysAllowPaths("/healthz", "/livez", "/readyz")

func newConfig(aq agentquerier.AgentQuerier,
npq querier.AgentNetworkPolicyInfoQuerier,
secureServing *genericoptions.SecureServingOptionsWithLoopback,
authentication *genericoptions.DelegatingAuthenticationOptions,
authorization *genericoptions.DelegatingAuthorizationOptions,
enableMetrics bool,
kubeconfig string,
) (*genericapiserver.CompletedConfig, error) {
// kubeconfig file is useful when antrea-agent isn't running as a Pod.
if len(kubeconfig) > 0 {
authentication.RemoteKubeConfigFile = kubeconfig
Expand All @@ -132,8 +143,6 @@ func newConfig(npq querier.AgentNetworkPolicyInfoQuerier, bindPort int, enableMe
// Set the PairName but leave certificate directory blank to generate in-memory by default.
secureServing.ServerCert.CertDirectory = ""
secureServing.ServerCert.PairName = Name
secureServing.BindAddress = net.IPv4zero
secureServing.BindPort = bindPort

if err := secureServing.MaybeDefaultWithSelfSignedCerts("localhost", nil, []net.IP{net.ParseIP("127.0.0.1"), net.IPv6loopback}); err != nil {
return nil, fmt.Errorf("error creating self-signed certificates: %v", err)
Expand Down Expand Up @@ -164,13 +173,22 @@ func newConfig(npq querier.AgentNetworkPolicyInfoQuerier, bindPort int, enableMe
}
serverConfig.EnableMetrics = enableMetrics
// Add readiness probe to check the status of watchers.
check := healthz.NamedCheck("watcher", func(_ *http.Request) error {
watcherCheck := healthz.NamedCheck("watcher", func(_ *http.Request) error {
if npq.GetControllerConnectionStatus() {
return nil
}
return fmt.Errorf("some watchers may not be connected")
})
serverConfig.ReadyzChecks = append(serverConfig.ReadyzChecks, check)
serverConfig.ReadyzChecks = append(serverConfig.ReadyzChecks, watcherCheck)
// Add liveness probe to check the connection with OFSwitch.
// This helps automatic recovery if some issues cause OFSwitch reconnection to not work properly, e.g. issue #4092.
ovsConnCheck := healthz.NamedCheck("ovs", func(_ *http.Request) error {
if aq.GetOpenflowClient().IsConnected() {
return nil
}
return fmt.Errorf("disconnected from OFSwitch")
})
serverConfig.LivezChecks = append(serverConfig.LivezChecks, ovsConnCheck)

completedServerCfg := serverConfig.Complete(nil)
return &completedServerCfg, nil
Expand Down
Loading

0 comments on commit 40bebc4

Please sign in to comment.